Convert the internals of blkdev_direct_IO to use a generic endio function
instead of calling aio_complete directly.  This may also fix some bugs/races
in this code: for instance, the bio completion handler now checks
bio->bi_size instead of assuming it's zero, and it accumulates the
bytes_done counter atomically (assuming that the bio completion handler
can't race with itself *might* be valid here, but the direct-io code makes
no such assumption).  I'm also pretty sure that the address_space->direct_IO
functions aren't supposed to mess with iocb->ki_pos or ->ki_left.

---

diff -urpN -X dontdiff a/fs/block_dev.c b/fs/block_dev.c
--- a/fs/block_dev.c    2007-01-12 20:26:25.000000000 -0800
+++ b/fs/block_dev.c    2007-01-12 20:23:55.000000000 -0800
@@ -131,10 +131,32 @@ blkdev_get_block(struct inode *inode, se
        return 0;
 }
 
-static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+struct bdev_aio {
+       atomic_t                iocount;        /* refcount */
+       atomic_t                bytes_done;     /* byte counter */
+       int                     err;            /* error handling */
+       file_endio_t            *endio;         /* end I/O notify fn */
+       void                    *endio_data;    /* notify fn private data */
+};
+
+static void blk_io_put(struct bdev_aio *io)
+{
+       if (!atomic_dec_and_test(&io->iocount))
+               return;
+
+       if (!io->endio)
+               return complete((struct completion*)io->endio_data);
+
+       io->endio(io->endio_data, atomic_read(&io->bytes_done), io->err);
+       kfree(io);
+}
+
+static int blk_bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 {
-       struct kiocb *iocb = bio->bi_private;
-       atomic_t *bio_count = &iocb->ki_bio_count;
+       struct bdev_aio *io = bio->bi_private;
+
+       if (bio->bi_size)
+               return 1;
 
        if (bio_data_dir(bio) == READ)
                bio_check_pages_dirty(bio);
@@ -143,16 +165,21 @@ static int blk_end_aio(struct bio *bio, 
                bio_put(bio);
        }
 
-       /* iocb->ki_nbytes stores error code from LLDD */
-       if (error)
-               iocb->ki_nbytes = -EIO;
-
-       if (atomic_dec_and_test(bio_count))
-               aio_complete(iocb, iocb->ki_left, iocb->ki_nbytes);
+       if (error)
+               io->err = error;
+       atomic_add(bytes_done, &io->bytes_done);
 
+       blk_io_put(io);
        return 0;
 }
 
+static void blk_io_init(struct bdev_aio *io)
+{
+       atomic_set(&io->iocount, 1);
+       atomic_set(&io->bytes_done, 0);
+       io->err = 0;
+}
+
 #define VEC_SIZE       16
 struct pvec {
        unsigned short nr;
@@ -208,24 +235,33 @@ blkdev_direct_IO(int rw, struct kiocb *i
 
        unsigned long addr;     /* user iovec address */
        size_t count;           /* user iovec len */
-       size_t nbytes = iocb->ki_nbytes = iocb->ki_left; /* total xfer size */
+       size_t nbytes;           /* total xfer size */
        loff_t size;            /* size of block device */
        struct bio *bio;
-       atomic_t *bio_count = &iocb->ki_bio_count;
+       struct bdev_aio stack_io, *io;
+       file_endio_t *endio = aio_complete;
+       void *endio_data = iocb;
        struct page *page;
        struct pvec pvec;
 
        pvec.nr = 0;
        pvec.idx = 0;
 
+       io = &stack_io;
+       if (endio) {
+               io = kmalloc(sizeof(struct bdev_aio), GFP_KERNEL);
+               if (!io)
+                       return -ENOMEM;
+       }
+       blk_io_init(io);
+
        if (pos & blocksize_mask)
                return -EINVAL;
 
+       nbytes = iov_length(iov, nr_segs);
        size = i_size_read(inode);
-       if (pos + nbytes > size) {
+       if (pos + nbytes > size)
                nbytes = size - pos;
-               iocb->ki_left = nbytes;
-       }
 
        /*
         * check first non-zero iov alignment, the remaining
@@ -237,7 +273,6 @@ blkdev_direct_IO(int rw, struct kiocb *i
                if (addr & blocksize_mask || count & blocksize_mask)
                        return -EINVAL;
        } while (!count && ++seg < nr_segs);
-       atomic_set(bio_count, 1);
 
        while (nbytes) {
                /* roughly estimate number of bio vec needed */
@@ -248,8 +283,8 @@ blkdev_direct_IO(int rw, struct kiocb *i
                /* bio_alloc should not fail with GFP_KERNEL flag */
                bio = bio_alloc(GFP_KERNEL, nvec);
                bio->bi_bdev = I_BDEV(inode);
-               bio->bi_end_io = blk_end_aio;
-               bio->bi_private = iocb;
+               bio->bi_end_io = blk_bio_endio;
+               bio->bi_private = io;
                bio->bi_sector = pos >> blkbits;
 same_bio:
                cur_off = addr & ~PAGE_MASK;
@@ -289,18 +324,27 @@ same_bio:
                /* bio is ready, submit it */
                if (rw == READ)
                        bio_set_pages_dirty(bio);
-               atomic_inc(bio_count);
+               atomic_inc(&io->iocount);
                submit_bio(rw, bio);
        }
 
 completion:
-       iocb->ki_left -= nbytes;
-       nbytes = iocb->ki_left;
-       iocb->ki_pos += nbytes;
+       if (!endio) {
+               struct completion event;
+
+               init_completion(&event);
+               io->endio = NULL;
+               io->endio_data = &event;
+
+               if (!atomic_dec_and_test(&io->iocount))
+                       wait_for_completion(&event);
+               return io->err ? io->err : atomic_read(&io->bytes_done);
+       }
 
-       if (atomic_dec_and_test(bio_count))
-               aio_complete(iocb, nbytes, 0);
+       io->endio = endio;
+       io->endio_data = endio_data;
 
+       blk_io_put(io);
        return -EIOCBQUEUED;
 
 backout:
@@ -316,7 +360,7 @@ backout:
         * if no bio was submmitted, return the error code.
         * otherwise, proceed with pending I/O completion.
         */
-       if (atomic_read(bio_count) == 1)
+       if (atomic_read(&io->iocount) == 1)
                return PTR_ERR(page);
        goto completion;
 }
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to