Commit:     65b8291c4000e5f38fc94fb2ca0cb7e8683c8a1b
Parent:     00e9fa2d6421fbbefb4c02821a1e779a3ce47781
Author:     Zach Brown <[EMAIL PROTECTED]>
AuthorDate: Fri Mar 16 13:38:11 2007 -0800
Committer:  Linus Torvalds <[EMAIL PROTECTED]>
CommitDate: Fri Mar 16 19:25:04 2007 -0700

    [PATCH] dio: invalidate clean pages before dio write
    This patch fixes a user-triggerable oops that was reported by Leonid
    Ananiev (the link to the archived report is missing from this copy).
    dio writes invalidate clean pages that intersect the written region so that
    subsequent buffered reads go to disk to read the new data.  If this fails
    the interface tries to tell the caller that the cache is inconsistent by
    returning EIO.
    Before this patch we had the problem where this invalidation failure would
    clobber -EIOCBQUEUED as it made its way from fs/direct-io.c to fs/aio.c.
    Both fs/aio.c and bio completion call aio_complete() and we reference freed
    memory, usually oopsing.
    This patch addresses this problem by invalidating before the write so that
    we can cleanly return -EIO before ->direct_IO() has had a chance to return
    -EIOCBQUEUED.
    There is a compromise here.  During the dio write we can fault in mmap()ed
    pages which intersect the written range with get_user_pages() if the user
    provided them for the source buffer.  This is a crazy thing to do, but we
    can make it mostly work in most cases by trying the invalidation again.
    The compromise is that we won't return an error if this second invalidation
    fails if it's an AIO write and we have -EIOCBQUEUED.
    This was tested by having two processes race performing large O_DIRECT and
    buffered ordered writes.  Within minutes ext3 would see a race between
    ext3_releasepage() and jbd holding a reference on ordered data buffers and
    would cause invalidation to fail, panicking the box.  The test can be found
    in the 'aio_dio_bugs' test group (link missing from this copy).  After this
    patch the test passes.
    Signed-off-by: Zach Brown <[EMAIL PROTECTED]>
    Signed-off-by: Benjamin LaHaise <[EMAIL PROTECTED]>
    Cc: Leonid Ananiev <[EMAIL PROTECTED]>
    Cc: Nick Piggin <[EMAIL PROTECTED]>
    Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
    Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
 mm/filemap.c |   46 +++++++++++++++++++++++++++++++++++-----------
 1 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index d1060b8..5dfc093 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2379,7 +2379,8 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const 
struct iovec *iov,
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        ssize_t retval;
-       size_t write_len = 0;
+       size_t write_len;
+       pgoff_t end = 0; /* silence gcc */
         * If it's a write, unmap all mmappings of the file up-front.  This
@@ -2388,23 +2389,46 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, 
const struct iovec *iov,
        if (rw == WRITE) {
                write_len = iov_length(iov, nr_segs);
+               end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
                if (mapping_mapped(mapping))
                        unmap_mapping_range(mapping, offset, write_len, 0);
        retval = filemap_write_and_wait(mapping);
-       if (retval == 0) {
-               retval = mapping->a_ops->direct_IO(rw, iocb, iov,
-                                               offset, nr_segs);
-               if (rw == WRITE && mapping->nrpages) {
-                       pgoff_t end = (offset + write_len - 1)
-                                               >> PAGE_CACHE_SHIFT;
-                       int err = invalidate_inode_pages2_range(mapping,
+       if (retval)
+               goto out;
+       /*
+        * After a write we want buffered reads to be sure to go to disk to get
+        * the new data.  We invalidate clean cached page from the region we're
+        * about to write.  We do this *before* the write so that we can return
+        * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
+        */
+       if (rw == WRITE && mapping->nrpages) {
+               retval = invalidate_inode_pages2_range(mapping,
                                        offset >> PAGE_CACHE_SHIFT, end);
-                       if (err)
-                               retval = err;
-               }
+               if (retval)
+                       goto out;
+       retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
+       if (retval)
+               goto out;
+       /*
+        * Finally, try again to invalidate clean pages which might have been
+        * faulted in by get_user_pages() if the source of the write was an
+        * mmap()ed region of the file we're writing.  That's a pretty crazy
+        * thing to do, so we don't support it 100%.  If this invalidation
+        * fails and we have -EIOCBQUEUED we ignore the failure.
+        */
+       if (rw == WRITE && mapping->nrpages) {
+               int err = invalidate_inode_pages2_range(mapping,
+                                             offset >> PAGE_CACHE_SHIFT, end);
+               if (err && retval >= 0)
+                       retval = err;
+       }
        return retval;
