ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. With ext4_fc_del() waiting for
EXT4_STATE_FC_COMMITTING, iput() can trigger
ext4_clear_inode()->ext4_fc_del() in the commit thread and deadlock waiting
for the fast commit to finish.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <[email protected]>
---
 fs/ext4/fast_commit.c | 47 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 809170d46167b..966211a3342a0 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1210,13 +1210,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 
        alloc_ctx = ext4_fc_lock(sb);
        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-               inodes[i] = igrab(&iter->vfs_inode);
-               if (inodes[i])
-                       i++;
+               inodes[i++] = &iter->vfs_inode;
        }
 
        list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], 
fcd_list) {
                struct ext4_inode_info *ei;
+               struct inode *inode;
 
                if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
                        continue;
@@ -1226,12 +1225,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
                /* See the comment in ext4_fc_commit_dentry_updates(). */
                ei = list_first_entry(&fc_dentry->fcd_dilist,
                                      struct ext4_inode_info, i_fc_dilist);
+               inode = &ei->vfs_inode;
                if (!list_empty(&ei->i_fc_list))
                        continue;
 
-               inodes[i] = igrab(&ei->vfs_inode);
-               if (inodes[i])
-                       i++;
+               /*
+                * Create-only inodes may only be referenced via fcd_dilist and
+                * not appear on s_fc_q[MAIN]. They may hit the last iput while
+                * we are snapshotting, but inode eviction calls ext4_fc_del(),
+                * which waits for FC_COMMITTING to clear. Mark them 
FC_COMMITTING
+                * so the inode stays pinned and the snapshot stays valid until
+                * ext4_fc_cleanup().
+                */
+               ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+               inodes[i++] = inode;
        }
        ext4_fc_unlock(sb, alloc_ctx);
 
@@ -1241,10 +1248,6 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
                        break;
        }
 
-       for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-               if (inodes[nr_inodes])
-                       iput(inodes[nr_inodes]);
-       }
        kvfree(inodes);
        return ret;
 }
@@ -1312,8 +1315,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
        jbd2_journal_lock_updates(journal);
        /*
         * The journal is now locked. No more handles can start and all the
-        * previous handles are now drained. We now mark the inodes on the
-        * commit queue as being committed.
+        * previous handles are now drained. Snapshotting happens in this
+        * window so log writing can consume only stable snapshots without
+        * doing logical-to-physical mapping.
         */
        alloc_ctx = ext4_fc_lock(sb);
        list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1565,6 +1569,25 @@ static void ext4_fc_cleanup(journal_t *journal, int 
full, tid_t tid)
                                              struct ext4_inode_info,
                                              i_fc_dilist);
                        ext4_fc_free_inode_snap(&ei->vfs_inode);
+                       spin_lock(&ei->i_fc_lock);
+                       ext4_clear_inode_state(&ei->vfs_inode,
+                                              EXT4_STATE_FC_REQUEUE);
+                       ext4_clear_inode_state(&ei->vfs_inode,
+                                              EXT4_STATE_FC_COMMITTING);
+                       spin_unlock(&ei->i_fc_lock);
+                       /*
+                        * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+                        * visible before we send the wakeup. Pairs with 
implicit
+                        * barrier in prepare_to_wait() in ext4_fc_del().
+                        */
+                       smp_mb();
+#if (BITS_PER_LONG < 64)
+                       wake_up_bit(&ei->i_state_flags,
+                                   EXT4_STATE_FC_COMMITTING);
+#else
+                       wake_up_bit(&ei->i_flags,
+                                   EXT4_STATE_FC_COMMITTING);
+#endif
                }
                list_del_init(&fc_dentry->fcd_dilist);
 
-- 
2.53.0


Reply via email to