Andrew Morton wrote:
> On Mon, 3 Dec 2007 16:52:47 +0300
> "Denis V. Lunev" <[EMAIL PROTECTED]> wrote:
> 
>> There is a AB-BA deadlock regarding drop_caches sysctl. Here are the
code
>> paths:

snip...

> One way to fix jbd (and jbd2) would be:
> 
> static void __journal_temp_unlink_buffer(struct journal_head *jh,
>                                       struct buffer_head **bh_to_dirty)
> {
>       *bh_to_dirty = NULL;
>       ...
>       if (test_clear_buffer_jbddirty(bh))
>               *bh_to_dirty = bh;
> }
> 
> {
>       struct buffer_head *bh_to_dirty;        /* probably needs
uninitialized_var() */
> 
>       ...
>       __journal_temp_unlink_buffer(jh, &bh_to_dirty);
>       ...
>       jbd_mark_buffer_dirty(bh_to_dirty);
>       brelse(bh_to_dirty);
>       ...
> }
> 
> static inline void jbd_mark_buffer_dirty(struct buffer_head *bh)
> {
>       if (bh)
>               mark_buffer_dirty(bh);
> }
> 
> 
> --
Hi Andrew,
here's an implementation of your suggested approach for jbd.

Without this patch my test-case will lockup in 3 to 5 iterations, with
the patch I have completed 96+ runs.

I don't see any significant difference in write performance with this
patch applied. Subjectively I feel that my system is more responsive
when under heavy write loads but I don't have data to back that up, so
it might just be wishful thinking on my part!   

I've tested this on AMD 64x2 SMP 2.6.24-rc*

the patch is against 2.6.24-rc5.

Cheers
Richard

 fs/jbd/commit.c      |   19 +++++++++++---
 fs/jbd/transaction.c |   66 +++++++++++++++++++++++++++++++++++----------------
 include/linux/jbd.h  |   11 ++++++--
 3 files changed, 70 insertions(+), 26 deletions(-)
----
commit 5b3824aa42a07682777836814fc7d1882416c6d3
Author: Richard Kennedy <[EMAIL PROTECTED]>
Date:   Mon Dec 17 12:05:36 2007 +0000

    fix lockup in when calling drop_caches
    
    calling /proc/sys/vm/drop_caches can hang due to a AB/BA lock dependency
    between j_list_lock and the inode_lock. This patch moves the redirtying of 
the buffer head out
    from under the j_list_lock.
    
    based on a suggestion by Andrew Morton.
    
    Signed-off-by: Richard Kennedy <[EMAIL PROTECTED]>

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 610264b..9115b5d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -181,6 +181,7 @@ static void journal_submit_data_buffers(journal_t *journal,
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
+       struct buffer_head *dirty_bh;
 
        /*
         * Whenever we unlock the journal and sleep, things can get added
@@ -254,7 +255,8 @@ write_out_data:
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
-                       __journal_unfile_buffer(jh);
+                       __journal_unfile_buffer(jh, &dirty_bh);
+                       jbd_mark_buffer_dirty(dirty_bh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
@@ -296,6 +298,7 @@ void journal_commit_transaction(journal_t *journal)
        int first_tag = 0;
        int tag_flag;
        int i;
+       struct buffer_head *dirty_bh;
 
        /*
         * First job: lock down the current transaction and wait for
@@ -453,7 +456,9 @@ void journal_commit_transaction(journal_t *journal)
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
-                       __journal_unfile_buffer(jh);
+                       struct buffer_head *dirty_bh;
+                       __journal_unfile_buffer(jh, &dirty_bh);
+                       jbd_mark_buffer_dirty(dirty_bh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);
                        put_bh(bh);
@@ -833,7 +838,12 @@ restart_loop:
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
-                       __journal_refile_buffer(jh);
+                       __journal_refile_buffer(jh, &dirty_bh);
+                       if (dirty_bh) {
+                               spin_unlock(&journal->j_list_lock);
+                               jbd_mark_buffer_dirty(dirty_bh);
+                               spin_lock(&journal->j_list_lock);
+                       }
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
@@ -845,7 +855,8 @@ restart_loop:
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
-                       __journal_refile_buffer(jh);
+                       /* As !jbddirty dirty_bh will always be null so we can 
ignore it */
+                       __journal_refile_buffer(jh, &dirty_bh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                 /* needs a brelse */
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 08ff6c7..5bf6202 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,7 +26,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 
-static void __journal_temp_unlink_buffer(struct journal_head *jh);
+static void __journal_temp_unlink_buffer(struct journal_head *jh,
+               struct buffer_head **bh);
 
 /*
  * get_transaction: obtain a new transaction_t object.
@@ -938,6 +939,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head 
*bh)
        journal_t *journal = handle->h_transaction->t_journal;
        int need_brelse = 0;
        struct journal_head *jh;
+       struct buffer_head *dirty_bh = NULL;
 
        if (is_handle_aborted(handle))
                return 0;
@@ -1054,7 +1056,7 @@ int journal_dirty_data(handle_t *handle, struct 
buffer_head *bh)
                        /* journal_clean_data_list() may have got there first */
                        if (jh->b_transaction != NULL) {
                                JBUFFER_TRACE(jh, "unfile from commit");
-                               __journal_temp_unlink_buffer(jh);
+                               __journal_temp_unlink_buffer(jh, &dirty_bh);
                                /* It still points to the committing
                                 * transaction; move it to this one so
                                 * that the refile assert checks are
@@ -1073,7 +1075,7 @@ int journal_dirty_data(handle_t *handle, struct 
buffer_head *bh)
                if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
                        JBUFFER_TRACE(jh, "not on correct data list: unfile");
                        J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
-                       __journal_temp_unlink_buffer(jh);
+                       __journal_temp_unlink_buffer(jh, &dirty_bh);
                        jh->b_transaction = handle->h_transaction;
                        JBUFFER_TRACE(jh, "file as data");
                        __journal_file_buffer(jh, handle->h_transaction,
@@ -1085,6 +1087,7 @@ int journal_dirty_data(handle_t *handle, struct 
buffer_head *bh)
        }
 no_journal:
        spin_unlock(&journal->j_list_lock);
+       jbd_mark_buffer_dirty(dirty_bh);
        jbd_unlock_bh_state(bh);
        if (need_brelse) {
                BUFFER_TRACE(bh, "brelse");
@@ -1217,6 +1220,7 @@ int journal_forget (handle_t *handle, struct buffer_head 
*bh)
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        struct journal_head *jh;
+       struct buffer_head *dirty_bh = NULL;
        int drop_reserve = 0;
        int err = 0;
 
@@ -1269,10 +1273,12 @@ int journal_forget (handle_t *handle, struct 
buffer_head *bh)
                 */
 
                if (jh->b_cp_transaction) {
-                       __journal_temp_unlink_buffer(jh);
+                       __journal_temp_unlink_buffer(jh, &dirty_bh);
+                       jbd_mark_buffer_dirty(dirty_bh);
                        __journal_file_buffer(jh, transaction, BJ_Forget);
                } else {
-                       __journal_unfile_buffer(jh);
+                       __journal_unfile_buffer(jh, &dirty_bh);
+                       jbd_mark_buffer_dirty(dirty_bh);
                        journal_remove_journal_head(bh);
                        __brelse(bh);
                        if (!buffer_jbd(bh)) {
@@ -1507,8 +1513,10 @@ __blist_del_buffer(struct journal_head **list, struct 
journal_head *jh)
  * Generally the caller needs to re-read the pointer from the transaction_t.
  *
  * Called under j_list_lock.  The journal may not be locked.
+ * returns a buffer head that needs to be marked dirty
  */
-static void __journal_temp_unlink_buffer(struct journal_head *jh)
+static void __journal_temp_unlink_buffer(struct journal_head *jh,
+               struct buffer_head **dirty_bh)
 {
        struct journal_head **list = NULL;
        transaction_t *transaction;
@@ -1523,6 +1531,7 @@ static void __journal_temp_unlink_buffer(struct 
journal_head *jh)
        if (jh->b_jlist != BJ_None)
                J_ASSERT_JH(jh, transaction != NULL);
 
+       *dirty_bh = NULL;
        switch (jh->b_jlist) {
        case BJ_None:
                return;
@@ -1557,21 +1566,24 @@ static void __journal_temp_unlink_buffer(struct 
journal_head *jh)
        __blist_del_buffer(list, jh);
        jh->b_jlist = BJ_None;
        if (test_clear_buffer_jbddirty(bh))
-               mark_buffer_dirty(bh);  /* Expose it to the VM */
+               *dirty_bh = bh; /* bh needs to be dirtied to expose it to the 
vm */
 }
 
-void __journal_unfile_buffer(struct journal_head *jh)
+void __journal_unfile_buffer(struct journal_head *jh,
+               struct buffer_head **dirty_bh)
 {
-       __journal_temp_unlink_buffer(jh);
+       __journal_temp_unlink_buffer(jh, dirty_bh);
        jh->b_transaction = NULL;
 }
 
 void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
 {
+       struct buffer_head *bh;
        jbd_lock_bh_state(jh2bh(jh));
        spin_lock(&journal->j_list_lock);
-       __journal_unfile_buffer(jh);
+       __journal_unfile_buffer(jh, &bh);
        spin_unlock(&journal->j_list_lock);
+       jbd_mark_buffer_dirty(bh);
        jbd_unlock_bh_state(jh2bh(jh));
 }
 
@@ -1584,6 +1596,8 @@ static void
 __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 {
        struct journal_head *jh;
+       int need_brelse = 0;
+       struct buffer_head *dirty_bh = NULL;
 
        jh = bh2jh(bh);
 
@@ -1598,9 +1612,9 @@ __journal_try_to_free_buffer(journal_t *journal, struct 
buffer_head *bh)
                if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
                        /* A written-back ordered data buffer */
                        JBUFFER_TRACE(jh, "release data");
-                       __journal_unfile_buffer(jh);
+                       __journal_unfile_buffer(jh, &dirty_bh);
                        journal_remove_journal_head(bh);
-                       __brelse(bh);
+                       need_brelse = 1;
                }
        } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
                /* written-back checkpointed metadata buffer */
@@ -1608,10 +1622,13 @@ __journal_try_to_free_buffer(journal_t *journal, struct 
buffer_head *bh)
                        JBUFFER_TRACE(jh, "remove from checkpoint list");
                        __journal_remove_checkpoint(jh);
                        journal_remove_journal_head(bh);
-                       __brelse(bh);
+                       need_brelse = 1;
                }
        }
        spin_unlock(&journal->j_list_lock);
+       jbd_mark_buffer_dirty(dirty_bh);
+       if (need_brelse)
+               __brelse(bh);
 out:
        return;
 }
@@ -1702,8 +1719,10 @@ static int __dispose_buffer(struct journal_head *jh, 
transaction_t *transaction)
 {
        int may_free = 1;
        struct buffer_head *bh = jh2bh(jh);
+       struct buffer_head *dirty_bh;
 
-       __journal_unfile_buffer(jh);
+       __journal_unfile_buffer(jh, &dirty_bh);
+       jbd_mark_buffer_dirty(dirty_bh);
 
        if (jh->b_cp_transaction) {
                JBUFFER_TRACE(jh, "on running+cp transaction");
@@ -1956,6 +1975,7 @@ void __journal_file_buffer(struct journal_head *jh,
        struct journal_head **list = NULL;
        int was_dirty = 0;
        struct buffer_head *bh = jh2bh(jh);
+       struct buffer_head *dirty_bh;
 
        J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
        assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1978,8 +1998,10 @@ void __journal_file_buffer(struct journal_head *jh,
                        was_dirty = 1;
        }
 
-       if (jh->b_transaction)
-               __journal_temp_unlink_buffer(jh);
+       if (jh->b_transaction) {
+               __journal_temp_unlink_buffer(jh, &dirty_bh);
+               jbd_mark_buffer_dirty(dirty_bh);
+       }
        jh->b_transaction = transaction;
 
        switch (jlist) {
@@ -2041,7 +2063,8 @@ void journal_file_buffer(struct journal_head *jh,
  *
  * Called under jbd_lock_bh_state(jh2bh(jh))
  */
-void __journal_refile_buffer(struct journal_head *jh)
+void __journal_refile_buffer(struct journal_head *jh,
+               struct buffer_head **dirty_bh)
 {
        int was_dirty;
        struct buffer_head *bh = jh2bh(jh);
@@ -2052,7 +2075,7 @@ void __journal_refile_buffer(struct journal_head *jh)
 
        /* If the buffer is now unused, just drop it. */
        if (jh->b_next_transaction == NULL) {
-               __journal_unfile_buffer(jh);
+               __journal_unfile_buffer(jh, dirty_bh);
                return;
        }
 
@@ -2062,7 +2085,8 @@ void __journal_refile_buffer(struct journal_head *jh)
         */
 
        was_dirty = test_clear_buffer_jbddirty(bh);
-       __journal_temp_unlink_buffer(jh);
+       /* as we just cleared jbddirty we could safely ignore dirty_bh */
+       __journal_temp_unlink_buffer(jh, dirty_bh);
        jh->b_transaction = jh->b_next_transaction;
        jh->b_next_transaction = NULL;
        __journal_file_buffer(jh, jh->b_transaction,
@@ -2090,14 +2114,16 @@ void __journal_refile_buffer(struct journal_head *jh)
 void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
 {
        struct buffer_head *bh = jh2bh(jh);
+       struct buffer_head *dirty_bh;
 
        jbd_lock_bh_state(bh);
        spin_lock(&journal->j_list_lock);
 
-       __journal_refile_buffer(jh);
+       __journal_refile_buffer(jh, &dirty_bh);
        jbd_unlock_bh_state(bh);
        journal_remove_journal_head(bh);
 
        spin_unlock(&journal->j_list_lock);
+       jbd_mark_buffer_dirty(dirty_bh);
        __brelse(bh);
 }
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index d9ecd13..8c22a33 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -835,14 +835,21 @@ struct journal_s
 
 /* Filing buffers */
 extern void journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __journal_unfile_buffer(struct journal_head *);
-extern void __journal_refile_buffer(struct journal_head *);
+extern void __journal_unfile_buffer(struct journal_head *,
+               struct buffer_head **dirty_bh);
+extern void __journal_refile_buffer(struct journal_head *,
+               struct buffer_head **dirty_bh);
 extern void journal_refile_buffer(journal_t *, struct journal_head *);
 extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
 extern void __journal_free_buffer(struct journal_head *bh);
 extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
 extern void __journal_clean_data_list(transaction_t *transaction);
 
+static inline void jbd_mark_buffer_dirty(struct buffer_head *bh)
+{
+       if (bh)
+               mark_buffer_dirty(bh);
+}
 /* Log buffer allocation */
 extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
 int journal_next_log_block(journal_t *, unsigned long *);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to