On 11/29/18 3:55 PM, Jens Axboe wrote:
> The bio referencing has a trick that doesn't do any actual atomic
> inc/dec on the reference count until we have to elevate it to > 1. For the
> async IO O_DIRECT case, we can't use the simple DIO variants, so we use
> __blkdev_direct_IO(). It always grabs an extra reference to the bio
> after allocation, which means we then enter the slower path of actually
> having to do atomic_inc/dec on the count.
>
> We don't need to do that for the async case, unless we end up going
> multi-bio, in which case we're already doing huge amounts of IO. For the
> smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref.
I accidentally generated this against the aio-poll branch; here's a
version that isn't.
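To recap the trick the commit message refers to, bio_get()/bio_put()
look roughly like this (simplified from include/linux/bio.h and
block/bio.c, error paths trimmed): a bio starts life without BIO_REFFED
set, and bio_put() only does the atomic dec once bio_get() has flagged
the bio as truly refcounted.

static inline void bio_get(struct bio *bio)
{
	/* first extra reference flips the bio into refcounted mode */
	bio->bi_flags |= (1 << BIO_REFFED);
	smp_mb__before_atomic();
	atomic_inc(&bio->__bi_cnt);
}

void bio_put(struct bio *bio)
{
	if (!bio_flagged(bio, BIO_REFFED)) {
		/* never bio_get()'d: sole owner, free without an atomic dec */
		bio_free(bio);
	} else {
		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));

		/* last put frees it */
		if (atomic_dec_and_test(&bio->__bi_cnt))
			bio_free(bio);
	}
}

With the patch below, sync still grabs its extra reference up front,
async only grabs one when flipping to multi-bio, and the single-bio
async case never sets BIO_REFFED at all, so completion stays on the
flag-check-only path.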
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d233a59ea364..5c0a224bf9cb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -281,10 +281,25 @@ struct blkdev_dio {
static struct bio_set blkdev_dio_pool;
+static void blkdev_bio_dirty_release(struct bio *bio, bool should_dirty)
+{
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i)
+ put_page(bvec->bv_page);
+ bio_put(bio);
+ }
+}
+
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
+ bool should_free = false;
if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
if (bio->bi_status && !dio->bio.bi_status)
@@ -302,25 +317,20 @@ static void blkdev_bio_end_io(struct bio *bio)
}
dio->iocb->ki_complete(iocb, ret, 0);
- bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
WRITE_ONCE(dio->waiter, NULL);
blk_wake_io_task(waiter);
}
+ /* last bio is done, now we can free the parent holding dio */
+ should_free = dio->multi_bio;
}
- if (should_dirty) {
- bio_check_pages_dirty(bio);
- } else {
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i)
- put_page(bvec->bv_page);
- bio_put(bio);
- }
+ if (!dio->is_sync)
+ blkdev_bio_dirty_release(bio, should_dirty);
+ if (should_free)
+ bio_put(&dio->bio);
}
static ssize_t
@@ -343,14 +353,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
return -EINVAL;
bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
- bio_get(bio); /* extra ref for the completion handler */
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
- if (dio->is_sync)
+ if (dio->is_sync) {
dio->waiter = current;
- else
+ bio_get(bio);
+ } else {
dio->iocb = iocb;
+ }
dio->size = 0;
dio->multi_bio = false;
@@ -400,6 +411,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
}
if (!dio->multi_bio) {
+ /*
+ * async needs an extra reference only if we are going
+ * multi-bio, so we can ensure that 'dio'
+ * sticks around.
+ */
+ if (!is_sync)
+ bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
@@ -433,6 +451,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
if (likely(!ret))
ret = dio->size;
+ blkdev_bio_dirty_release(bio, dio->should_dirty);
bio_put(&dio->bio);
return ret;
}
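If you want to hammer on this path from userspace, here's a minimal
libaio test case that hits the async single-bio case (device path,
4k size, and alignment are illustrative; note the libaio wrappers
return -errno instead of setting errno):

/* gcc -o aio_direct aio_direct.c -laio */
#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd, ret;

	/* any block device will do; O_DIRECT keeps us on the dio path */
	fd = open("/dev/sda", O_RDONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* O_DIRECT needs aligned buffers; 4096 covers common block sizes */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	ret = io_setup(1, &ctx);
	if (ret < 0) {
		fprintf(stderr, "io_setup: %d\n", ret);
		return 1;
	}

	/* one small async read: is_sync_kiocb() is false, single bio */
	io_prep_pread(&cb, fd, buf, 4096, 0);
	ret = io_submit(ctx, 1, cbs);
	if (ret != 1) {
		fprintf(stderr, "io_submit: %d\n", ret);
		return 1;
	}

	ret = io_getevents(ctx, 1, 1, &ev, NULL);
	if (ret == 1)
		printf("read returned %lld\n", (long long)ev.res);

	io_destroy(ctx);
	close(fd);
	free(buf);
	return 0;
}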
--
Jens Axboe