The bio referencing has a trick that doesn't do any actual atomic
inc/dec on the reference count until we have to elevator to > 1. For the
async IO O_DIRECT case, we can't use the simple DIO variants, so we use
__blkdev_direct_IO(). It always grabs an extra reference to the bio
after allocation, which means we then enter the slower path of actually
having to do atomic_inc/dec on the count.

We don't need to do that for the async case, unless we end up going
multi-bio, in which case we're already doing huge amounts of IO. For the
smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref.

Signed-off-by: Jens Axboe <[email protected]>

---

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f3bd41f80421..44fc9da13f68 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -292,10 +292,26 @@ static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
        return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
 }
 
+static void blkdev_bio_dirty_release(struct bio *bio, bool should_dirty)
+{
+       if (should_dirty) {
+               bio_check_pages_dirty(bio);
+       } else {
+               struct bio_vec *bvec;
+               int i;
+
+               if (!bio_flagged(bio, BIO_HOLD_PAGES))
+                       bio_for_each_segment_all(bvec, bio, i)
+                               put_page(bvec->bv_page);
+               bio_put(bio);
+       }
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
        struct blkdev_dio *dio = bio->bi_private;
        bool should_dirty = dio->should_dirty;
+       bool should_free = false;
 
        if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
                if (bio->bi_status && !dio->bio.bi_status)
@@ -313,26 +329,20 @@ static void blkdev_bio_end_io(struct bio *bio)
                        }
 
                        dio->iocb->ki_complete(iocb, ret, 0);
-                       bio_put(&dio->bio);
                } else {
                        struct task_struct *waiter = dio->waiter;
 
                        WRITE_ONCE(dio->waiter, NULL);
                        blk_wake_io_task(waiter);
                }
+               /* last bio is done, now we can free the parent holding dio */
+               should_free = dio->multi_bio;
        }
 
-       if (should_dirty) {
-               bio_check_pages_dirty(bio);
-       } else {
-               struct bio_vec *bvec;
-               int i;
-
-               if (!bio_flagged(bio, BIO_HOLD_PAGES))
-                       bio_for_each_segment_all(bvec, bio, i)
-                               put_page(bvec->bv_page);
-               bio_put(bio);
-       }
+       if (!dio->is_sync)
+               blkdev_bio_dirty_release(bio, should_dirty);
+       if (should_free)
+               bio_put(&dio->bio);
 }
 
 static ssize_t
@@ -355,14 +365,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
                return -EINVAL;
 
        bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-       bio_get(bio); /* extra ref for the completion handler */
 
        dio = container_of(bio, struct blkdev_dio, bio);
        dio->is_sync = is_sync = is_sync_kiocb(iocb);
-       if (dio->is_sync)
+       if (dio->is_sync) {
                dio->waiter = current;
-       else
+               bio_get(bio);
+       } else {
                dio->iocb = iocb;
+       }
 
        dio->size = 0;
        dio->multi_bio = false;
@@ -425,6 +436,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
                }
 
                if (!dio->multi_bio) {
+                       /*
+                        * async needs an extra reference if we are goign
+                        * multi-bio only, so we can ensure that 'dio'
+                        * sticks around.
+                        */
+                       if (!is_sync)
+                               bio_get(bio);
                        dio->multi_bio = true;
                        atomic_set(&dio->ref, 2);
                } else {
@@ -458,6 +476,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
        if (likely(!ret))
                ret = dio->size;
 
+       blkdev_bio_dirty_release(bio, dio->should_dirty);
        bio_put(&dio->bio);
        return ret;
 }

-- 
Jens Axboe

Reply via email to