Gitweb:     
http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=7fb5409df092589b86cc9412d926879cb572b7f0
Commit:     7fb5409df092589b86cc9412d926879cb572b7f0
Parent:     8009f9fb3067fef6c2ca0c16f6bac786ae28639d
Author:     Jan Kara <[EMAIL PROTECTED]>
AuthorDate: Sun Feb 10 01:08:38 2008 -0500
Committer:  Theodore Ts'o <[EMAIL PROTECTED]>
CommitDate: Sun Feb 10 01:08:38 2008 -0500

    ext4: Fix Direct I/O locking
    
    We cannot start transaction in ext4_direct_IO() and just let it last
    during the whole write because dio_get_page() acquires mmap_sem which
    ranks above transaction start (e.g. because we have dependency chain
    mmap_sem->PageLock->journal_start, or because we update atime while
    holding mmap_sem) and thus deadlocks could happen. We solve the problem
    by starting a transaction separately for each ext4_get_block() call.
    
    We *could* have a problem that we allocate a block and before its data
    are written out the machine crashes and thus we expose stale data. But
    that does not happen because for hole-filling generic code falls back to
    buffered writes and for file extension, we add inode to orphan list and
    thus in case of crash, journal replay will truncate inode back to the
    original size.
    
    Signed-off-by: Jan Kara <[EMAIL PROTECTED]>
    Signed-off-by: Mingming Cao <[EMAIL PROTECTED]>
    Signed-off-by: "Theodore Ts'o" <[EMAIL PROTECTED]>
---
 fs/ext4/inode.c |  107 +++++++++++++++++++++++++++----------------------------
 1 files changed, 53 insertions(+), 54 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bbfabf8..7dd9b50 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,7 +892,16 @@ out:
        return err;
 }
 
-#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
 
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                        unsigned long max_blocks, struct buffer_head *bh,
@@ -939,49 +948,31 @@ static int ext4_get_block(struct inode *inode, sector_t 
iblock,
                        struct buffer_head *bh_result, int create)
 {
        handle_t *handle = ext4_journal_current_handle();
-       int ret = 0;
+       int ret = 0, started = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
-       if (!create)
-               goto get_block;         /* A read */
-
-       if (max_blocks == 1)
-               goto get_block;         /* A single block get */
-
-       if (handle->h_transaction->t_state == T_LOCKED) {
-               /*
-                * Huge direct-io writes can hold off commits for long
-                * periods of time.  Let this commit run.
-                */
-               ext4_journal_stop(handle);
-               handle = ext4_journal_start(inode, DIO_CREDITS);
-               if (IS_ERR(handle))
+       if (create && !handle) {
+               /* Direct IO write... */
+               if (max_blocks > DIO_MAX_BLOCKS)
+                       max_blocks = DIO_MAX_BLOCKS;
+               handle = ext4_journal_start(inode, DIO_CREDITS +
+                             2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+               if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-               goto get_block;
-       }
-
-       if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
-               /*
-                * Getting low on buffer credits...
-                */
-               ret = ext4_journal_extend(handle, DIO_CREDITS);
-               if (ret > 0) {
-                       /*
-                        * Couldn't extend the transaction.  Start a new one.
-                        */
-                       ret = ext4_journal_restart(handle, DIO_CREDITS);
+                       goto out;
                }
+               started = 1;
        }
 
-get_block:
-       if (ret == 0) {
-               ret = ext4_get_blocks_wrap(handle, inode, iblock,
+       ret = ext4_get_blocks_wrap(handle, inode, iblock,
                                        max_blocks, bh_result, create, 0);
-               if (ret > 0) {
-                       bh_result->b_size = (ret << inode->i_blkbits);
-                       ret = 0;
-               }
+       if (ret > 0) {
+               bh_result->b_size = (ret << inode->i_blkbits);
+               ret = 0;
        }
+       if (started)
+               ext4_journal_stop(handle);
+out:
        return ret;
 }
 
@@ -1671,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * if the machine crashes during the write.
  *
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
@@ -1680,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct ext4_inode_info *ei = EXT4_I(inode);
-       handle_t *handle = NULL;
+       handle_t *handle;
        ssize_t ret;
        int orphan = 0;
        size_t count = iov_length(iov, nr_segs);
@@ -1688,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb 
*iocb,
        if (rw == WRITE) {
                loff_t final_size = offset + count;
 
-               handle = ext4_journal_start(inode, DIO_CREDITS);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
                if (final_size > inode->i_size) {
+                       /* Credits for sb + inode write */
+                       handle = ext4_journal_start(inode, 2);
+                       if (IS_ERR(handle)) {
+                               ret = PTR_ERR(handle);
+                               goto out;
+                       }
                        ret = ext4_orphan_add(handle, inode);
-                       if (ret)
-                               goto out_stop;
+                       if (ret) {
+                               ext4_journal_stop(handle);
+                               goto out;
+                       }
                        orphan = 1;
                        ei->i_disksize = inode->i_size;
+                       ext4_journal_stop(handle);
                }
        }
 
@@ -1706,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb 
*iocb,
                                 offset, nr_segs,
                                 ext4_get_block, NULL);
 
-       /*
-        * Reacquire the handle: ext4_get_block() can restart the transaction
-        */
-       handle = ext4_journal_current_handle();
-
-out_stop:
-       if (handle) {
+       if (orphan) {
                int err;
 
-               if (orphan && inode->i_nlink)
+               /* Credits for sb + inode write */
+               handle = ext4_journal_start(inode, 2);
+               if (IS_ERR(handle)) {
+                       /* This is really bad luck. We've written the data
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
+                       goto out;
+               }
+               if (inode->i_nlink)
                        ext4_orphan_del(handle, inode);
-               if (orphan && ret > 0) {
+               if (ret > 0) {
                        loff_t end = offset + ret;
                        if (end > inode->i_size) {
                                ei->i_disksize = end;
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to