[PATCH 19/22] block: Rename bio_split() - bio_pair_split()
This is prep work for introducing a more general bio_split(). Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: NeilBrown ne...@suse.de Cc: Alasdair Kergon a...@redhat.com Cc: Lars Ellenberg lars.ellenb...@linbit.com Cc: Peter Osterlund pete...@telia.com Cc: Sage Weil s...@inktank.com --- drivers/block/pktcdvd.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/raid0.c | 6 +++--- drivers/md/raid10.c | 2 +- fs/bio.c| 4 ++-- include/linux/bio.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index f16dfca..c4c8871 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2413,7 +2413,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) if (last_zone != zone) { BUG_ON(last_zone != zone + pd-settings.size); first_sectors = last_zone - bio-bi_iter.bi_sector; - bp = bio_split(bio, first_sectors); + bp = bio_pair_split(bio, first_sectors); BUG_ON(!bp); pkt_make_request(q, bp-bio1); pkt_make_request(q, bp-bio2); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fb3b0d0..e9b53e9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); - bp = bio_split(bio, end_sector - bio-bi_iter.bi_sector); + bp = bio_pair_split(bio, end_sector - bio-bi_iter.bi_sector); linear_make_request(mddev, bp-bio1); linear_make_request(mddev, bp-bio2); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 8ee1a6c..ea754dd 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -534,11 +534,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) * refuse to split for us, so we need to split it. 
*/ if (likely(is_power_of_2(chunk_sects))) - bp = bio_split(bio, chunk_sects - (sector + bp = bio_pair_split(bio, chunk_sects - (sector (chunk_sects-1))); else - bp = bio_split(bio, chunk_sects - - sector_div(sector, chunk_sects)); + bp = bio_pair_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); raid0_make_request(mddev, bp-bio1); raid0_make_request(mddev, bp-bio2); bio_pair_release(bp); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2303f59..7e49cbe 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1193,7 +1193,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - bp = bio_split(bio, chunk_sects - + bp = bio_pair_split(bio, chunk_sects - (bio-bi_iter.bi_sector (chunk_sects - 1))); /* Each of these 'make_request' calls will call 'wait_barrier'. diff --git a/fs/bio.c b/fs/bio.c index 7737984..5767f97 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1746,7 +1746,7 @@ static void bio_pair_end_2(struct bio *bi, int err) /* * split a bio - only worry about a bio with a single page in its iovec */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) +struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors) { struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); @@ -1793,7 +1793,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } -EXPORT_SYMBOL(bio_split); +EXPORT_SYMBOL(bio_pair_split); /** * bio_trim - trim a bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 1b06bcb..bc3fa72 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,7 +298,7 @@ struct bio_pair { atomic_tcnt; int error; }; -extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); +extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); extern void bio_trim(struct bio *bio, int offset, int size); -- 
1.8.4.rc1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/22] block: Kill bio_segments()/bi_vcnt usage
When we start sharing biovecs, keeping bi_vcnt accurate for splits is going to be error prone - and unnecessary, if we refactor some code. So bio_segments() has to go - but most of the existing users just needed to know if the bio had multiple segments, which is easier - add a bio_multiple_segments() for them. (Two of the current uses of bio_segments() are going to go away in a couple patches, but the current implementation of bio_segments() is unsafe as soon as we start doing driver conversions for immutable biovecs - so implement a dumb version for bisectability, it'll go away in a couple patches) Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Neil Brown ne...@suse.de Cc: Nagalakshmi Nandigama nagalakshmi.nandig...@lsi.com Cc: Sreekanth Reddy sreekanth.re...@lsi.com Cc: James E.J. Bottomley jbottom...@parallels.com --- drivers/block/ps3disk.c | 7 ++--- drivers/md/bcache/io.c | 53 ++-- drivers/md/raid0.c | 2 +- drivers/md/raid10.c | 2 +- drivers/message/fusion/mptsas.c | 8 ++--- drivers/scsi/libsas/sas_expander.c | 8 ++--- drivers/scsi/mpt2sas/mpt2sas_transport.c | 10 +++--- drivers/scsi/mpt3sas/mpt3sas_transport.c | 8 ++--- fs/bio.c | 2 +- include/linux/bio.h | 43 +- 10 files changed, 75 insertions(+), 68 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 464be78..8d1a19c 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -101,10 +101,9 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev, rq_for_each_segment(bvec, req, iter) { unsigned long flags; - dev_dbg(dev-sbd.core, - %s:%u: bio %u: %u segs %u sectors from %lu\n, - __func__, __LINE__, i, bio_segments(iter.bio), - bio_sectors(iter.bio), iter.bio-bi_iter.bi_sector); + dev_dbg(dev-sbd.core, %s:%u: bio %u: %u sectors from %lu\n, + __func__, __LINE__, i, bio_sectors(iter.bio), + iter.bio-bi_iter.bi_sector); size = bvec-bv_len; buf = bvec_kmap_irq(bvec, flags); diff --git a/drivers/md/bcache/io.c 
b/drivers/md/bcache/io.c index 9b5b6a4..6e04f3b 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -24,7 +24,8 @@ static void bch_generic_make_request_hack(struct bio *bio) if (bio-bi_iter.bi_idx) { struct bio_vec bv; struct bvec_iter iter; - struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); + unsigned segs = bio_segments(bio); + struct bio *clone = bio_alloc(GFP_NOIO, segs); bio_for_each_segment(bv, bio, iter) clone-bi_io_vec[clone-bi_vcnt++] = bv; @@ -32,7 +33,7 @@ static void bch_generic_make_request_hack(struct bio *bio) clone-bi_iter.bi_sector = bio-bi_iter.bi_sector; clone-bi_bdev = bio-bi_bdev; clone-bi_rw= bio-bi_rw; - clone-bi_vcnt = bio_segments(bio); + clone-bi_vcnt = segs; clone-bi_iter.bi_size = bio-bi_iter.bi_size; clone-bi_private = bio; @@ -133,40 +134,32 @@ out: static unsigned bch_bio_max_sectors(struct bio *bio) { - unsigned ret = bio_sectors(bio); struct request_queue *q = bdev_get_queue(bio-bi_bdev); - unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, - queue_max_segments(q)); + struct bio_vec bv; + struct bvec_iter iter; + unsigned ret = 0, seg = 0; if (bio-bi_rw REQ_DISCARD) - return min(ret, q-limits.max_discard_sectors); - - if (bio_segments(bio) max_segments || - q-merge_bvec_fn) { - struct bio_vec bv; - struct bvec_iter iter; - unsigned seg = 0; - - ret = 0; + return min(bio_sectors(bio), q-limits.max_discard_sectors); - bio_for_each_segment(bv, bio, iter) { - struct bvec_merge_data bvm = { - .bi_bdev= bio-bi_bdev, - .bi_sector = bio-bi_iter.bi_sector, - .bi_size= ret 9, - .bi_rw = bio-bi_rw, - }; - - if (seg == max_segments) - break; + bio_for_each_segment(bv, bio, iter) { + struct bvec_merge_data bvm = { + .bi_bdev= bio-bi_bdev, + .bi_sector = bio-bi_iter.bi_sector
Re: [PATCH-v3 1/4] idr: Percpu ida
On Tue, Aug 20, 2013 at 02:31:57PM -0700, Andrew Morton wrote: On Fri, 16 Aug 2013 23:09:06 + Nicholas A. Bellinger n...@linux-iscsi.org wrote: From: Kent Overstreet k...@daterainc.com Percpu frontend for allocating ids. With percpu allocation (that works), it's impossible to guarantee it will always be possible to allocate all nr_tags - typically, some will be stuck on a remote percpu freelist where the current job can't get to them. We do guarantee that it will always be possible to allocate at least (nr_tags / 2) tags - this is done by keeping track of which and how many cpus have tags on their percpu freelists. On allocation failure if enough cpus have tags that there could potentially be (nr_tags / 2) tags stuck on remote percpu freelists, we then pick a remote cpu at random to steal from. Note that there's no cpu hotplug notifier - we don't care, because steal_tags() will eventually get the down cpu's tags. We _could_ satisfy more allocations if we had a notifier - but we'll still meet our guarantees and it's absolutely not a correctness issue, so I don't think it's worth the extra code. ... include/linux/idr.h | 53 + lib/idr.c | 316 +-- I don't think this should be in idr.[ch] at all. It has no relationship with the existing code. Apart from duplicating its functionality :( Well, in the full patch series it does make use of the non-percpu ida. I'm still hoping to get the ida/idr rewrites in. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH-v3 1/4] idr: Percpu ida
On Wed, Aug 21, 2013 at 06:25:58PM +, Christoph Lameter wrote: On Fri, 16 Aug 2013, Nicholas A. Bellinger wrote: + spinlock_t lock; Remove the spinlock. As Andrew noted, the spinlock is needed because of tag stealing. (You don't think I'd stick a spinlock on a percpu data structure without a real reason, would you?) + unsignednr_free; + unsignedfreelist[]; +}; + +static inline void move_tags(unsigned *dst, unsigned *dst_nr, +unsigned *src, unsigned *src_nr, +unsigned nr) +{ + *src_nr -= nr; + memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr); + *dst_nr += nr; +} + +static inline unsigned alloc_local_tag(struct percpu_ida *pool, + struct percpu_ida_cpu *tags) Pass the __percpu offset and not the tags pointer. Why? It just changes where the this_cpu_ptr +{ + int tag = -ENOSPC; + + spin_lock(tags-lock); Interupts are already disabled. Drop the spinlock. + if (tags-nr_free) + tag = tags-freelist[--tags-nr_free]; You can keep this or avoid address calculation through segment prefixes. F.e. if (__this_cpu_read(tags-nrfree) { int n = __this_cpu_dec_return(tags-nr_free); tag = __this_cpu_read(tags-freelist[n]); } Can you explain what the point of that change would be? It sounds like it's preferable to do it that way and avoid this_cpu_ptr() for some reason, but you're not explaining why. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] Bcache fix for 3.11
Last writeback fix had a brown paper bag bug in it. The fix for that is a one liner though: The following changes since commit 9df79a37eeb7ee89dc56aeed3051be676369359a: bcache: Fix a flush/fua performance bug (2013-08-19 15:30:45 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11 for you to fetch changes up to 174c1412939888ffedc4da02b375bc579af2a93c: bcache: Fix a dumb CPU spinning bug in writeback (2013-08-23 16:30:22 -0700) Kent Overstreet (1): bcache: Fix a dumb CPU spinning bug in writeback drivers/md/bcache/writeback.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Bcache fixes for stable
Two fixes that went into 3.11 should've been marked for stable, but I screwed up and forgot to mark them. One of them fixes bugs that appear to be being hit in the wild, so please pull these: The following changes since commit 0a4b6d4ff200a553951f77f765971cb3e4c91ec0: Linux 3.10.9 (2013-08-20 15:40:47 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.10 for you to fetch changes up to a213846fafcdf52153dfebf8b1671498a72b6285: bcache: FUA fixes (2013-08-26 14:56:52 -0700) Kent Overstreet (1): bcache: FUA fixes Kumar Amit Mehta (1): md: bcache: io.c: fix a potential NULL pointer dereference drivers/md/bcache/btree.c | 23 +-- drivers/md/bcache/io.c | 2 ++ drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/request.c | 13 - 4 files changed, 36 insertions(+), 4 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Bcache fixes for stable
On Mon, Aug 26, 2013 at 03:19:57PM -0700, Greg Kroah-Hartman wrote: On Mon, Aug 26, 2013 at 03:02:49PM -0700, Kent Overstreet wrote: Two fixes that went into 3.11 should've been marked for stable, but I screwed up and forgot to mark them. One of them fixes bugs that appear to being hit in the wild, so please pull these: The following changes since commit 0a4b6d4ff200a553951f77f765971cb3e4c91ec0: Linux 3.10.9 (2013-08-20 15:40:47 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.10 for you to fetch changes up to a213846fafcdf52153dfebf8b1671498a72b6285: bcache: FUA fixes (2013-08-26 14:56:52 -0700) Kent Overstreet (1): bcache: FUA fixes Kumar Amit Mehta (1): md: bcache: io.c: fix a potential NULL pointer dereference Can I get a hint as to what the git commit ids of these patches are in Linus's tree? 5c694129c8db6d89c9be109049a16510b2f70f6d e49c7c374e7aacd1f04ecbc21d9dbbeeea4a77d6 Ideally, that's all you need to send me, but if you want to send me a git pull, that's fine too, but please use a signed tag, otherwise it's just a random tree... *nod* I need to get some more signatures on my gpg key... Also, if you do a git pull, I need the git commit ids in the changelog entry that match Linus's tree, so people (like me) can verify things. Ok, I'll remember that for the future. For now, I can edit these two patches and add them in, if you give me the git commit ids to verify them. Ok - I don't care too much either way but there were some slightly nontrivial conflicts on the second patch, and the one I did backported has been tested. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
-08-26 21:05:27 INFO: task ceph-osd:8798 blocked for more than 120 seconds. Stefan Am 22.08.2013 09:32, schrieb Stefan Priebe - Profihost AG: great! Everything seems to work fine now! Except read_dirty always going to negative values after a reboot. Stefan Am 22.08.2013 08:02, schrieb Kent Overstreet: On Thu, Aug 22, 2013 at 07:59:04AM +0200, Stefan Priebe wrote: schedule_timeout() is not the same as schedule_timeout_interruptible(). just search and replace? So i can try on my own. The one in read_dirty(), line ~330 -- To unsubscribe from this list: send the line unsubscribe linux-bcache in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-bcache in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH-v4 1/6] idr: Percpu ida
On Tue, Aug 27, 2013 at 01:43:14PM -0700, Andrew Morton wrote: On Tue, 27 Aug 2013 20:11:41 + Nicholas A. Bellinger n...@linux-iscsi.org wrote: Percpu frontend for allocating ids. With percpu allocation (that works), it's impossible to guarantee it will always be possible to allocate all nr_tags - typically, some will be stuck on a remote percpu freelist where the current job can't get to them. We do guarantee that it will always be possible to allocate at least (nr_tags / 2) tags - this is done by keeping track of which and how many cpus have tags on their percpu freelists. On allocation failure if enough cpus have tags that there could potentially be (nr_tags / 2) tags stuck on remote percpu freelists, we then pick a remote cpu at random to steal from. Note that there's no cpu hotplug notifier - we don't care, because steal_tags() will eventually get the down cpu's tags. We _could_ satisfy more allocations if we had a notifier - but we'll still meet our guarantees and it's absolutely not a correctness issue, so I don't think it's worth the extra code. Except for one silly trivial thing, all of my August 20 review comments were ignored. You didn't even bother replying to the email. Sorry! I remember seeing that email now, but somehow I completely missed all the review comments after the first one. Mea culpa! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote: Vanilla 3.10.7 + bcache: Fix a writeback performance regression http://pastebin.com/raw.php?i=LXZk4cMH Whoops, at first I thought this was the same bug as one I'd already been chasing down that had been a harmless bug - turns out I didn't look closely enough at the backtrace. What happened is background writeback is deadlocking, because for some reason the workqueue it's running out of is a singlethreaded workqueue, so as soon as it decides to queue enough writeback bios that it has to sleep on that semaphore (which often won't happen due to the PD controller based ratelimiting) - boom, deadlock. Here's the fixup patch I just tested and am applying: From 0af68de350e05e43fd093b36dcb0fe8aa838fabf Mon Sep 17 00:00:00 2001 From: Kent Overstreet k...@daterainc.com Date: Mon, 19 Aug 2013 15:26:22 -0700 Subject: [PATCH] bcache: Fix a writeback deadlock Signed-off-by: Kent Overstreet k...@daterainc.com diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f88c62e..27ac519 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -351,7 +351,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(io-bio, cl, io-dc-disk); - continue_at(cl, write_dirty_finish, dirty_wq); + continue_at(cl, write_dirty_finish, system_wq); } static void read_dirty_endio(struct bio *bio, int error) @@ -371,7 +371,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(io-bio, cl, io-dc-disk); - continue_at(cl, write_dirty, dirty_wq); + continue_at(cl, write_dirty, system_wq); } static void read_dirty(struct closure *cl) @@ -512,7 +512,7 @@ void bch_writeback_exit(void) int __init bch_writeback_init(void) { - dirty_wq = create_singlethread_workqueue(bcache_writeback); + dirty_wq = create_workqueue(bcache_writeback); if (!dirty_wq) return -ENOMEM; -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida
On Tue, Aug 20, 2013 at 02:29:56PM -0700, Andrew Morton wrote: On Tue, 20 Aug 2013 14:19:06 -0700 Nicholas A. Bellinger n...@linux-iscsi.org wrote: On Thu, 2013-08-08 at 14:32 +, Christoph Lameter wrote: On Wed, 7 Aug 2013, Kent Overstreet wrote: One thing that was bugging me - I was never able to figure out for sure if smp_processor_id() returns a number in the range [0, nr_cpu_ids), at least I couldn't find where it was documented - could you tell me if that's true? I always assumed that it was in the range 0 ... nr_cpu_ids - 1 and that is the assumption under which the kernel code was written. Things would break horribly if smp_process_id would return nr_cpu_ids or higher. Hi guys, Just a heads up that I've put Kent's standalone percpu-ida patch (with Christoph's recommend changes) into target-pending/for-next here: https://git.kernel.org/cgit/linux/kernel/git/nab/target-pending.git/commit/?h=for-nextid=47bd524a5b3eb6429b058b8b562b45329ab2c9e7 I've got a number of target patches that depend on this code for v3.12, and a delay on this particular piece would be painful to endure.. Sooo, please yell loudly if there is an objection to percpu-ida merge as a completely standalone item, that does not effect any existing ida code. Was hoping that Tejun had time. I'll take a look... I think Tejun and I might be at a bit of an impasse with the ida rewrite itself, but I don't think there were any outstanding objections to the percpu ida code itself - and this is a standalone version. I was meaning to ask you Andrew, if you could take a look at the ida discussion and lend your opinion - I don't think there's any _specific_ technical objections left to my ida code, and it's now on a more philisophical complexity vs. ... level. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] bcache: Remove use of down/up_read_non_owner()
On Tue, Aug 20, 2013 at 11:16:02AM -0400, Steven Rostedt wrote: The down/up_read_non_owner() is a nasty hack in the API of the rwsem operations. It was once removed, but then resurrected for use with bcache. Not only is the API an abomination to the rwsem API, it also prevents bcache from ever being compiled with PREEMPT_RT, as the RT kernel requires all rwsems to have an owner. Instead of keeping the down/up_read_non_owner() around, it is better to modify the one user of it and have it do something a bit differently. From reading the bcache code, the reason for the non_owner usage is that a request is made, and writers must wait for that request to finish before they can continue. But because the request is completed by another task, the rwsem can not be held for read and then released on completion. Instead of abusing the rwsem code for this, I added a refcount nr_requests to the cached_dev structure as well as a write_waiters wait queue. When a request is to be made, the rwsem is still taken for read, but this time with an owner. The refcount is incremented and before exiting the function, the rwsem is released. The writer will then take the rwsem for write, check the refcount, if it is not zero, it will release the rwsem, add itself to a wait_event() waiting for refcount to become zero, and then try again. I _really_ disagree with this approach. I get that there's a problem, but the bcache code REALLY IS USING THE RWSEM AS A LOCK; the answer isn't to open code the lock! Apologies to Christoph for getting distracted and not responding when you started to explain what the issues were for RT. I'm not really convinced they're that insurmountable (one of the issues was debugging, which the _non_owner() stuff always handled just fine), but I'll take it on faith that this usage is incompatible with rwsems + the RT functionality since I haven't actually had time to dig into it. 
So assuming that's the case, IMO the sanest thing to do is make a new type of lock - rwsem_non_process or somesuch - and use _that_ in bcache. Not open coding the lock. It can even live in the bcache code if we want since there currently wouldn't be any other users, I don't really care. But open coding it? Come on... makes me wonder what other code in the kernel is open coding locks because it couldn't release it in the same process context that took the lock for whatever reason. Also, nack this patch because increasing the number of atomic ops to shared cachelines in our fast path. If it does end up being open coded, I'll make a more efficient version. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida
On Tue, Aug 20, 2013 at 10:07:42PM -0400, Tejun Heo wrote: Hello, Kent. On Tue, Aug 20, 2013 at 07:01:32PM -0700, Kent Overstreet wrote: I think Tejun and I might be at a bit of an impasse with the ida rewrite itself, but I don't think there were any outstanding objections to the percpu ida code itself - and this is a standalone version. The percpu ida code can be applied separately from the ida rewrite? Yes - at the cost of using significantly more memory for the global freelist I was meaning to ask you Andrew, if you could take a look at the ida discussion and lend your opinion - I don't think there's any _specific_ technical objections left to my ida code, and it's now on a more philisophical complexity vs. ... level. Hmmm... the objection was pretty specific - don't depend on high-order or vmalloc allocations when it can be easily avoided by using proper radix tree. We only do vmalloc allocations for CONFIG_COMPACTION=n, and then only when we need to allocate more than almost 1 _billion_ ids from a single ida (twice than on 32 bit, so never because that gets us just about to INT_MAX) - and then it's only 32k of vmalloc memory, for the entire ida. This is with max allocation order of 4 for COMPACTION=y, 2 for COMPACTION=n. All this for a performance improvement of 10x to 50x (or more), for the ida sizes I measured. So I could see your point if we were allocating gobs of vmalloc memory, or high order allocations big enough to realistically be problematic (I honestly don't think these will be) - but to me, this seems like a pretty reasonable tradeoff for those performance gains. 
(And the performance gains do substantially come from using more contiguous memory and treating the whole data structure as an array, and doing less pointer chasing/looping) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] bcache: Remove use of down/up_read_non_owner()
On Tue, Aug 20, 2013 at 10:41:53PM -0400, Steven Rostedt wrote: I get that there's a problem, but the bcache code REALLY IS USING THE RWSEM AS A LOCK; the answer isn't to open code the lock! Actually, it is using it as a semaphore. The problem with Linux was that it only had spin locks and semaphores. People would use the semaphore as either a completion or a mutex. Luckily, we created both of those and replaced 90% of all semaphores with either a mutex or a completion. The rwsem, sorta has the same issue. But it was pretty much always used as a read/write sleeping lock. It was not used as a semaphore. But it seems that the bcache really does use it as a semaphore here as it isn't just a mutex location. It's a special kind of completion, but the completion is in a strange way. I think we're mostly quibbling over semantics here, and it's not that big a deal - that said, I don't really get your distinction between a semaphore and a mutex; I'd distinguish them by saying a semaphore can count an arbitrary number of resources (the number it's initialized to), and a mutex is always initialized to one - the fact that in the kernel mutexes must be released by the same process that took them (while important for PI and debugging) is not fundamental. rw semaphore never made any sense to me; they're not counting anything, like regular semaphores. They're just sleepable rw locks. So it _sounds_ like you're saying bcache is using it as a semaphore, because it's using it as a lock we don't release in the original context? in analogy to linux mutexes and semaphores? Not quite sure what you mean. Apologies to Christoph for getting distracted and not responding when you started to explain what the issues were for RT. 
I'm not really convinced they're that insurmountable (one of the issues was debugging, which the _non_owner() stuff always handled just fine), but I'll take it on faith that this usage is incompatible with rwsems + the RT functionality since I haven't actually had time to dig into it. The thing is, RT requires priority inheritance. When a task blocks on a rwsem, it has to boost the owner of that rwsem otherwise it risks unbounded priority inversion. I have a hack that actually implements a work around for non_owner, but it adds overhead to all locks to do so. Ok - I'd just be curious why the PI bit can't be factored out into a wrapper like how the debug stuff is handled with the _non_owner bits, but I can certainly believe that. So assuming that's the case, IMO the sanest thing to do is make a new type of lock - rwsem_non_process or somesuch - and use _that_ in bcache. Not open coding the lock. I actually started that, but decided this was the easier patch to send. Don't worry, I was expecting this email. No surprises here ;-) This email was taken from Andrew Morton's play book (I see you Cc'd him, I forgot to). It's one of these patches of It's so bad that the owner of the code will fix the issue out of fear that this patch may make it in. Ah, I see :) It can even live in the bcache code if we want since there currently wouldn't be any other users, I don't really care. But open coding it? Come on... makes me wonder what other code in the kernel is open coding locks because it couldn't release it in the same process context that took the lock for whatever reason. Most cases just have completions. This looks like a case where don't do something while transaction is taking place. Which can usually get away with a wait event. Eh, only in sense that it's easy to implement mutexes and rwlocks with wait_event(). 
To simplify the algorithm just a bit (only leaving out some optimizations), bcache is using it to protect a rb tree, which containts things that are undergoing background writeback. For foreground writes, we've got to check if the write overlaps with anything in this rb tree. If so, we force _this_ write to writeback; background writeback will write stale data to the backing device, but that's fine because the data in the cache will still be marked as dirty. To add stuff to this rb tree - i.e. for background writeback to start flushing some dirty data - it's got to make sure it's not going to overwrite some in progress foreground writethrough write. Tracking every outstanding foreground write would be expensive, so foreground writes take a read lock on the rb tree (both to check it for anything they overlap with, and for their duration), and background writeback takes a write lock when it goes to scan for dirty data to add to the rb tree. Even if you complain it's not _just_ protecting the rb tree, we're still using it for mutual exclusion, plain and simple, and that's all a lock is. Also, nack this patch because increasing the number of atomic ops to shared cachelines in our fast path. If it does end up being open coded, I'll make a more efficient version. Perhaps I can whip up a struct rwsem_non_owner and have a
Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida
On Wed, Aug 21, 2013 at 07:59:41AM -0400, Tejun Heo wrote: Hello, Kent. On Tue, Aug 20, 2013 at 07:31:51PM -0700, Kent Overstreet wrote: All this for a performance improvement of 10x to 50x (or more), for the ida sizes I measured. That's misleading, isn't it? It's comparing it to the existing version that actually exists, instead of comparing it to a hypothetical approach that doesn't exist yet. I don't see how that's misleading. We should see large performance improvements even without the large pages. What matters more is the leaf node performance for vast majority of cases and an extra radix tree layer on top would cover most of whatever is left. Whether to use high order pages or not only affects the extreme niche use cases and I don't think going for high order pages to micro optimize those extreme use cases is the right trade off. So I could see your point if we were allocating gobs of vmalloc memory, or high order allocations big enough to realistically be problematic (I honestly don't think these will be) - but to me, this seems like a pretty reasonable tradeoff for those performance gains. The trade off is made up as the bulk of the performance benefit can be gained without resorting to high order allocations. I'm more and more skeptical that that's true, and it's a given that it wouldn't be as fast. (And the performance gains do substantially come from using more contiguous memory and treating the whole data structure as an array, and doing less pointer chasing/looping) I really have hard time buying that. Let's say you go with single page leaf node and an extra single page layer on top. How many IDs are we talking about? For the cases which are most performance sensitive, this doesn't even matter a bit as percpu caching layer would be on top anyway. I really don't think the micro optimization is called for at the cost of high order allocations from low level tool library. 
These micro optimizations mean either less pointer chasing or less branching in the _common_ case; you'd trade common case performance for avoiding ever doing higher order allocations (and 2 with COMPACTION=n and 4 with COMPACTION=y is not particularly high order!). I don't buy that that's a good tradeoff. If you're convinced radix trees are the way to go and it can be done without much performance cost, why not code it up and show us? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote: Am 20.08.2013 10:01, schrieb Stefan Priebe - Profihost AG: Am 20.08.2013 00:27, schrieb Kent Overstreet: On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote: Vanilla 3.10.7 + bcache: Fix a writeback performance regression http://pastebin.com/raw.php?i=LXZk4cMH Whoops, at first I thought this was the same bug as one I'd already been chasing down that had been a harmless bug - turns out I didn't look closely enough at the backtrace. What happened is background writeback is deadlocking, because for some reason the workqueue it's running out of is a singlethreaded workqueue, so as soon as it decides to queue enough writeback bios that it has to sleep on that semaphore (which often won't happen due to the PD controller based ratelimiting) - boom, deadlock. Here's the fixup patch I just tested and am applying: Oh i'm now seeing very high CPU spikes of kworker... i don't see if i remove bcache: Fix a writeback performance regression. *swears* I just saw that this morning, but I assumed it was a bug in my stripe aware scan code that isn't in this branch yet - thanks for letting me know that's not the case. I shall work on a fix for the fix... -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida
On Wed, Aug 21, 2013 at 05:16:50PM -0400, Tejun Heo wrote: Hello, Kent. On Wed, Aug 21, 2013 at 02:09:01PM -0700, Kent Overstreet wrote: These micro optimizations mean either less pointer chasing or less branching in the _common_ case; you'd trade common case performance for avoiding ever doing higher order allocations (and 2 with COMPACTION=n and 4 with COMPACTION=y is not particularly high order!). Order 4 allocation probably isn't as bad as before but it still is a lot nastier than single page allocations. You say doing it the other way would harm the common case performance but didn't answer my question about the number of IDs being served per page. How many can be served from a single page? And how many from two layer single page configuration? How are you defining the common case? With single page allocations: 1 << 15 bits per page, 1 << 9 pointers per page. So two layers of pointers does get us to 1 << 33 bits, which is what we need. But now, since we need two layers of pointers instead of one, we need either another pointer deref for a node lookup - _always_, even when we've got 8 bytes of bits - or we need to branch on the depth of the tree, which is something we don't have now. This is extra overhead _no matter the size of the ida_, over my current approach. I'm assuming the common case is one page of bits, based on the usage I've seen throughout the kernel that's probably way conservative. In that case, your approach is going to be slower than mine, and there's no difference in the size of the allocations. I don't buy that that's a good tradeoff. If you're convinced radix trees are the way to go and it can be done without much performance cost, why not code it up and show us? Well, I'm not the one trying to rewrite ida, so the onus to justify the proposed code is primarily on you. 
Another thing is that the proposed code is *not* using the existing radix tree and instead implementing its own simplified radix tree, which *can* be fine but the bar to clear is fairly high. You have to be able to show *clearly* that using the existing radix tree is not an option. Until now, the only thing that I gathered is the simplified thing is gonna be faster in some extreme cases while having clear disadvantage in terms of memory allocation. Not very convincing. I've already shown massive performance gains over the existing radix tree approach, you're the one claiming a different approach would be better. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote: Am 20.08.2013 10:01, schrieb Stefan Priebe - Profihost AG: Am 20.08.2013 00:27, schrieb Kent Overstreet: On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote: Vanilla 3.10.7 + bcache: Fix a writeback performance regression http://pastebin.com/raw.php?i=LXZk4cMH Whoops, at first I thought this was the same bug as one I'd already been chasing down that had been a harmless bug - turns out I didn't look closely enough at the backtrace. What happened is background writeback is deadlocking, because for some reason the workqueue it's running out of is a singlethreaded workqueue, so as soon as it decides to queue enough writeback bios that it has to sleep on that semaphore (which often won't happen due to the PD controller based ratelimiting) - boom, deadlock. Here's the fixup patch I just tested and am applying: Oh i'm now seeing very high CPU spikes of kworker... i don't see if i remove bcache: Fix a writeback performance regression. Are you able to reproduce it? I'm not having any luck reproducing it... -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
[a010c49e] request_write+0x19e/0x380 [bcache] Am 22.08.2013 01:47, schrieb Kent Overstreet: On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote: Am 20.08.2013 10:01, schrieb Stefan Priebe - Profihost AG: Am 20.08.2013 00:27, schrieb Kent Overstreet: On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote: Vanilla 3.10.7 + bcache: Fix a writeback performance regression http://pastebin.com/raw.php?i=LXZk4cMH Whoops, at first I thought this was the same bug as one I'd already been chasing down that had been a harmless bug - turns out I didn't look closely enough at the backtrace. What happened is background writeback is deadlocking, because for some reason the workqueue it's running out of is a singlethreaded workqueue, so as soon as it decides to queue enough writeback bios that it has to sleep on that semaphore (which often won't happen due to the PD controller based ratelimiting) - boom, deadlock. Here's the fixup patch I just tested and am applying: Oh i'm now seeing very high CPU spikes of kworker... i don't see if i remove bcache: Fix a writeback performance regression. Are you able to reproduce it? I'm not having any luck reproducing it... -- To unsubscribe from this list: send the line unsubscribe linux-bcache in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
On Thu, Aug 22, 2013 at 07:59:04AM +0200, Stefan Priebe wrote: schedule_timeout() is not the same as schedule_timeout_interruptible(). just search and replace? So i can try on my own. The one in read_dirty(), line ~330 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bcache: Fix a writeback performance regression
On Thu, Aug 22, 2013 at 09:32:52AM +0200, Stefan Priebe - Profihost AG wrote: great! Everything seems to work fine now! Except read_dirty always going to negative values after a reboot. That one's fixed in 3.11 :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 04/22] block: Abstract out bvec iterator
On Tue, Aug 13, 2013 at 10:03:04AM -0400, Ed Cashin wrote: On Aug 9, 2013, Ed Cashin wrote: On Aug 8, 2013, at 9:05 PM, Kent Overstreet wrote: ... It's in the for-jens branch now. Just examining the patches, I like the way it cleans up the aoe code. I had a question about a new BUG added by the for-jens branch the read-response handling path of the aoe driver. The aoe driver in linux-bcache/for-jens commit 4c36c973a8f45 is passing my tests. Here is a patch against that branch illustrating my suggestion for handling bad target responses gracefully. Thanks - shall I just fold that into the aoe immutable bvec patch? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Fri, Aug 09, 2013 at 10:57:56AM -0400, Tejun Heo wrote: Hello, On Wed, Aug 07, 2013 at 01:51:17PM -0700, Kent Overstreet wrote: + * So if the max section size is 64k, that's ~4096 sections, with 8 byte + * pointers that's a little over 32k for the pointers to sections. + * + * That means max size sections are order 4 page allocations. Order 4 allocations for common data structure doesn't really sound like a good idea to me. It's gonna work fine on relatively large machines but suck on mobile / small embedded devices, many of which are still struggling with 32bit address space and compaction may not be enabled. It just doesn't make sense to me to impose 64k allocations from low level library function like ida. I have a hard time seeing how it could really be an issue in practice - keep in mind, for every bit in the ida tree we're going to have some struct allocated somewhere else that the id corresponds to. So for a 4k ida, that's... bare minimum around 128k memory that has to be allocated somewhere else, and in a single subsystem, assuming 16 byte structs - and typically it'll be many times that. 32k memory in the ida -> 2 mb in the subsystem, absolute minimum. If you're convinced this is a real issue though - how about IDA_SECTION_SIZE conditional on CONFIG_COMPACTION, so we use order 2 or 3 allocations if CONFIG_COMPACTION=n? Then the max size toplevel array of pointers to segments would be bigger, but that's only an issue when we're allocating up to near INT_MAX ids, so it's difficult to see how _that_ would be an issue on a small/embedded system... and we could even use vmalloc for that allocation when the size of that array is > IDA_SECTION_SIZE. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Tue, Aug 13, 2013 at 06:19:28PM -0400, Tejun Heo wrote: Hello, On Tue, Aug 13, 2013 at 03:13:08PM -0700, Kent Overstreet wrote: If you're convinced this is a real issue though - how about It is a real issue. Large order allocation is fine for optimization but shouldn't be depended upon. It does fail easily without compaction and compaction is heavy-ass operation which will blow up any minute performance advantage you might get from avoiding proper radix tree implementation. IDA_SECTION_SIZE conditional on CONFIG_COMPACTION, so we use order 2 or 3 allocations if CONFIG_COMPACTION=n? Then the max size toplevel array of pointers to segments would be bigger, but that's only an issue when we're allocating up to near INT_MAX ids, so it's difficult to see how _that_ would be an issue on a small/embedded system... and we could even use vmalloc for that allocation when the size of that array is > IDA_SECTION_SIZE. What about cyclic allocations then? This is naturally a radix tree problem. I don't know why you're resisting radix tree so much here. It's only naturally a radix tree problem _if_ you require sparseness. Otherwise, radix trees require pointer chasing, which we can avoid - which saves us both the cost of chasing pointers (which is significant) and the overhead of storing them. The patch handles cyclic allocation by limiting sparseness - we talked about this and I thought you were ok with this solution, though it was awhile ago and I could be misremembering your comments. 
To recap, here's the code that implements that sparseness limiting, it's documented in ida_alloc_cyclic()'s docs: static int __ida_alloc_cyclic(struct ida *ida, unsigned start, unsigned end, gfp_t gfp, unsigned long *flags) __releases(ida-lock) __acquires(ida-lock) { int ret; unsigned id; ret = __ida_alloc_range_multiple(ida, id, 1, max(start, ida-cur_id), end, gfp, flags); if (ret 0) ret = __ida_alloc_range_multiple(ida, id, 1, start, end, gfp, flags); if (ret == 1) { ida-cur_id = id + 1; if ((ida-cur_id - start) / 2 max(1024U, ida-allocated_ids)) ida-cur_id = 0; return id; } return ret; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Fri, Aug 09, 2013 at 10:57:56AM -0400, Tejun Heo wrote: Hello, On Wed, Aug 07, 2013 at 01:51:17PM -0700, Kent Overstreet wrote: + * So if the max section size is 64k, that's ~4096 sections, with 8 byte + * pointers that's a little over 32k for the pointers to sections. + * + * That means max size sections are order 4 page allocations. Order 4 allocations for common data structure doesn't really sound like a good idea to me. It's gonna work fine on relatively large machines but suck on mobile / small embedded devices, many of which are still struggling with 32bit address space and compaction may not be enabled. It just doens't make sense to me to impose 64k allocations from low level library function like ida. Would this be an acceptable solution? From 483cfa0c809b7dc3b0abad93407468f273416578 Mon Sep 17 00:00:00 2001 From: Kent Overstreet k...@daterainc.com Date: Tue, 13 Aug 2013 15:31:20 -0700 Subject: [PATCH] ida: Use order 2 allocations when COMPACTION=n And fall back to vmalloc for the array of pointers to sections so we can still allocate up to INT_MAX ids. 
diff --git a/lib/idr.c b/lib/idr.c index 02a221c..3bffb52 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -21,16 +21,20 @@ #include linux/export.h #include linux/idr.h #include linux/kernel.h +#include linux/mm.h #include linux/percpu.h #include linux/rcupdate.h #include linux/sched.h #include linux/slab.h #include linux/string.h #include linux/spinlock.h +#include linux/vmalloc.h static void kgfree(void *ptr, size_t size) { - if (size PAGE_SIZE) + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else if (size PAGE_SIZE) kfree(ptr); else free_pages((unsigned long) ptr, get_order(size)); @@ -102,7 +106,14 @@ static void *kgalloc(size_t size, gfp_t gfp) */ #define IDA_TREE_ARY BITS_PER_LONG -#define IDA_SECTION_SIZE (64UL 10) + +/* Max section size, in bytes */ +#ifdef CONFIG_COMPACTION +#define IDA_SECTION_SIZE (64UL 10) /* order 4 page allocation */ +#else +#define IDA_SECTION_SIZE (16UL 10) /* order 2 */ +#endif + #define IDA_NODES_PER_SECTION (IDA_SECTION_SIZE / sizeof(unsigned long)) static inline unsigned long *ida_index_to_node(struct ida *ida, unsigned node) @@ -251,8 +262,15 @@ again: if (ida-nodes = IDA_NODES_PER_SECTION is_power_of_2(cur_sections)) { - sections = kgalloc(cur_sections * 2 * sizeof(unsigned long *), - __GFP_ZERO|gfp); + size_t bytes = cur_sections * 2 * sizeof(unsigned long *); + + if (bytes = IDA_SECTION_SIZE) + sections = kgalloc(bytes, __GFP_ZERO|gfp); + else if (gfp GFP_KERNEL) + sections = vzalloc(bytes); + else + sections = NULL; + if (!sections) goto err; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Tue, Aug 13, 2013 at 06:44:28PM -0400, Tejun Heo wrote: Hello, Kent. On Tue, Aug 13, 2013 at 03:27:59PM -0700, Kent Overstreet wrote: It's only naturally a radix tree problem _if_ you require sparseness. Well, it's not necessarily about requiring it but more about surviving it with some grace when things don't go as expected, which is an important characteristic for common library stuff. The patch I posted should solve the high order allocations stuff, and sparseness from cyclic allocations was already solved. Otherwise, radix trees require pointer chasing, which we can avoid - which saves us both the cost of chasing pointers (which is significant) and the overhead of storing them. Vast majority of which can be avoided with simple caching, right? Whatever caching optimizations you do with a radix tree version I could apply to this bitmap tree version, and my bitmap tree code is simpler and _considerably_ faster than the existing code. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Tue, Aug 13, 2013 at 07:22:11PM -0400, Tejun Heo wrote: Hello, On Tue, Aug 13, 2013 at 03:59:27PM -0700, Kent Overstreet wrote: Well, it's not necessarily about requiring it but more about surviving it with some grace when things don't go as expected, which is an important characteristic for common library stuff. The patch I posted should solve the high order allocations stuff, and sparseness from cyclic allocations was already solved. I don't know. Yeah, using vmalloc would be able to work around the issue for most cases, I suppose. It's iffy to consume vmalloc space from ida, which functionally is such a basic algorithmic construct. It probably won't worsen things noticeably but vmalloc area can be a very precious resource on 32bit configs. This is only using it for the array of pointers to sections though, not the bitmap itself - and only when that allocations is 16k. For INT_MAX allocated ids (absolute worst case) we'd be using 256k of vmalloc memory on 64 bit, half that on 32 bit. Whatever caching optimizations you do with a radix tree version I could apply to this bitmap tree version, and my bitmap tree code is simpler and _considerably_ faster than the existing code. But the difference won't really matter. Cached performance would be the same and that's likely to cover most cases, right? It's not like radix tree is orders of magnitude slower. Should probably be almost as good, yeah... in theory, but the space efficiency still isn't going to be as good, and it'll probably be more code... and at this point I really just don't want to futz with it more. At this point unless there's something really wrong with this code I just want to move onto something else :P -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] bcache fixes for 3.11
Jens, here's the latest bcache fixes. Some urgent stuff in here: The following changes since commit 79826c35eb99cd3c0873b8396f45fa26c87fb0b0: bcache: Allocation kthread fixes (2013-07-12 00:22:49 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11 for you to fetch changes up to 0434a516e99ee51ac2d0dfa71b2f56c89ac5db05: bcache: Fix a flush/fua performance bug (2013-08-14 15:44:51 -0700) Gabriel de Perthuis (1): bcache: Strip endline when writing the label through sysfs Geert Uytterhoeven (1): bcache: Correct printf()-style format length modifier Kent Overstreet (4): bcache: Fix a dumb journal discard bug bcache: Fix for when no journal entries are found bcache: Fix a writeback performance regression bcache: Fix a flush/fua performance bug drivers/md/bcache/bcache.h| 7 +++ drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/journal.c | 33 - drivers/md/bcache/sysfs.c | 9 +++-- drivers/md/bcache/util.c | 11 ++- drivers/md/bcache/util.h | 12 +--- drivers/md/bcache/writeback.c | 37 ++--- 7 files changed, 68 insertions(+), 43 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] idr: Document ida tree sections
On Tue, Aug 13, 2013 at 07:59:47PM -0400, Tejun Heo wrote: Hey, Kent. On Tue, Aug 13, 2013 at 04:51:33PM -0700, Kent Overstreet wrote: Should probably be almost as good, yeah... in theory, but the space efficiency still isn't going to be as good, and it'll probably be more code... and at this point I really just don't want to futz with it more. At this point unless there's something really wrong with this code I just want to move onto something else :P I think it probably would be okay in most cases but don't feel confident about acking as it's making trade-offs which are unnecessary and unusual. So, ummm, I really don't know. Maybe it's better enough than what we have now but at the same time if you want to reimplement the whole thing you should be persistent / reliable enough to see it through this time around too, right? I was just telling you how I felt :) Regardless of that, IMO what I've got now is superior to any radix tree based approach for what ida/idr are supposed to do. I could of course be wrong, but I'm not convinced... Re: caching the last allocation with a radix tree based implementation. I thought about that more last night, I don't think that would be viable for using ida underneath the percpu ida allocator. Reason being percpu ida has to heavily optimize for the case where almost all of the id space is allocated, and after awhile the id space is going to be fragmented - so caching the last allocated id is going to be useless. This is also why I implemented the ganged allocation bit, to amortize the bitmap tree traversal. So we'd lose out on that going back to a radix tree, or have to reimplement it (and it'll be slower due to pointer chasing). 
Which is all not the end of the world, but it means that if we punt on the ida/idr rewrites for now or change our minds about them - we _do_ have quite a bit of stuff waiting on the percpu ida allocator, so for that to go in separate I'll have to change it back to using a stack of integers for the global freelist - which does use significantly more memory. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [GIT PULL] bcache fixes for 3.11
On Thu, Aug 15, 2013 at 08:43:22AM +0200, Stefan Priebe - Profihost AG wrote: Am 15.08.2013 00:59, schrieb Kent Overstreet: Jens, here's the latest bcache fixes. Some urgent stuff in here: The following changes since commit 79826c35eb99cd3c0873b8396f45fa26c87fb0b0: bcache: Allocation kthread fixes (2013-07-12 00:22:49 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11 for you to fetch changes up to 0434a516e99ee51ac2d0dfa71b2f56c89ac5db05: bcache: Fix a flush/fua performance bug (2013-08-14 15:44:51 -0700) Gabriel de Perthuis (1): bcache: Strip endline when writing the label through sysfs Geert Uytterhoeven (1): bcache: Correct printf()-style format length modifier Kent Overstreet (4): bcache: Fix a dumb journal discard bug bcache: Fix for when no journal entries are found bcache: Fix a writeback performance regression bcache: Fix a flush/fua performance bug drivers/md/bcache/bcache.h| 7 +++ drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/journal.c | 33 - drivers/md/bcache/sysfs.c | 9 +++-- drivers/md/bcache/util.c | 11 ++- drivers/md/bcache/util.h | 12 +--- drivers/md/bcache/writeback.c | 37 ++--- 7 files changed, 68 insertions(+), 43 deletions(-) -- As 3.10 is long term stable release. You might need CC the stable list for patches which have to go to 3.10 as well. At least this one should go to 3.10: Gabriel de Perthuis (1): bcache: Strip endline when writing the label through sysfs That patch is fixing a bug in code that was merged for 3.11 - the rest are marked for stable :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 4/4] idr: Percpu ida
On Tue, Jun 18, 2013 at 02:14:53PM +, Christoph Lameter wrote: On Mon, 17 Jun 2013, Kent Overstreet wrote: +static inline unsigned alloc_local_tag(struct percpu_ida *pool, + struct percpu_ida_cpu *tags) +{ + int tag = -ENOSPC; + + spin_lock(tags-lock); + if (tags-nr_free) + tag = tags-freelist[--tags-nr_free]; + spin_unlock(tags-lock); + + return tag; +} This could be made much faster by avoiding real atomics (coming with spinlocks) and using per cpu atomics instead. Slub f.e. uses a single linked per cpu list managed via this_cpu_cmpxchg. Actually, we do need the atomic ops - they're protecting against a different cpu grabbing our freelist with steal_tags(). The alternative (which Andrew Morton suggested and Jens implemented in his fork of an earlier version of this code) would be to use IPIs, but there are reasonable use cases (high ratio of cpus:nr_tags) where tag stealing is going to be reasonably common so we don't want it to suck too much. Plus, steal_tags() needs to try different cpu's freelists until it finds one with tags, IPIs would make that loop... problematic. I actually originally used cmpxchg(), but then later in review I realized I had an ABA and couldn't figure out how to solve it, so that's when I switched to spinlocks (which Andrew Morton preferred anyways). But I just realized a few minutes ago I've seen your solution to ABA with a stack, in the slub code - double word cmpxchg() with a transaction id :) I may try that again just to see how the code looks, but I'm not sure the difference in performance will be big enough to matter, give that this lock should basically never be contended. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] lib/idr.c rewrite, percpu ida/tag allocator
This is the second iteration of patches 1-4 - there's only been a few trivial bugfixes for those. The rest is the idr rewrite - I reimplemented it on top of the new ida implementation and the existing radix tree implementation. Patch series is available in my git repo: git://evilpiepirate.org/~kent/linux-bcache.git idr arch/powerpc/mm/icswx_pid.c | 36 +- arch/powerpc/mm/mmu_context_hash64.c | 28 +- block/blk-core.c |4 +- block/blk-sysfs.c |2 +- block/bsg.c |2 +- block/genhd.c |2 +- drivers/atm/nicstar.c |4 +- drivers/base/platform.c |6 +- drivers/base/soc.c| 18 +- drivers/block/drbd/drbd_main.c|6 +- drivers/block/drbd/drbd_nl.c |2 +- drivers/block/loop.c |4 +- drivers/block/mtip32xx/mtip32xx.c | 24 +- drivers/block/nvme-core.c | 33 +- drivers/block/rsxx/core.c | 21 +- drivers/block/virtio_blk.c|6 +- drivers/dca/dca-sysfs.c | 18 +- drivers/dma/dmaengine.c |2 +- drivers/firewire/core-cdev.c |5 +- drivers/firewire/core-device.c|2 +- drivers/gpio/gpiolib.c|2 +- drivers/gpu/drm/drm_context.c |2 +- drivers/gpu/drm/drm_crtc.c|2 +- drivers/gpu/drm/drm_gem.c |8 +- drivers/gpu/drm/drm_stub.c|2 +- drivers/gpu/drm/exynos/exynos_drm_ipp.c |2 +- drivers/gpu/drm/i915/i915_gem_context.c |2 +- drivers/gpu/drm/qxl/qxl_cmd.c |4 +- drivers/gpu/drm/qxl/qxl_drv.h |1 - drivers/gpu/drm/qxl/qxl_kms.c |1 - drivers/gpu/drm/qxl/qxl_release.c | 19 +- drivers/gpu/drm/sis/sis_mm.c |2 +- drivers/gpu/drm/via/via_mm.c |2 +- drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c | 31 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c |4 +- drivers/hwmon/hwmon.c |6 +- drivers/hwmon/ibmaem.c| 10 +- drivers/i2c/i2c-core.c|4 +- drivers/iio/industrialio-core.c |4 +- drivers/iio/industrialio-trigger.c|6 +- drivers/infiniband/core/cm.c |7 +- drivers/infiniband/core/cma.c |2 +- drivers/infiniband/core/sa_query.c|4 +- drivers/infiniband/core/ucm.c |2 +- drivers/infiniband/core/ucma.c|4 +- drivers/infiniband/core/uverbs_cmd.c |4 +- drivers/infiniband/hw/amso1100/c2.h |1 - drivers/infiniband/hw/amso1100/c2_qp.c| 20 +- 
drivers/infiniband/hw/cxgb3/iwch.h|4 +- drivers/infiniband/hw/cxgb4/iw_cxgb4.h|4 +- drivers/infiniband/hw/ehca/ehca_cq.c |4 +- drivers/infiniband/hw/ehca/ehca_qp.c |4 +- drivers/infiniband/hw/ipath/ipath_driver.c|4 +- drivers/infiniband/hw/mlx4/cm.c |2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c|2 +- drivers/infiniband/hw/qib/qib_init.c |4 +- drivers/input/input.c |6 +- drivers/iommu/iommu.c | 18 +- drivers/ipack/ipack.c |4 +- drivers/md/dm.c | 22 +- drivers/memstick/core/memstick.c | 17 +- drivers/memstick/core/mspro_block.c |2 +- drivers/mfd/rtsx_pcr.c| 13 +- drivers/misc/c2port/core.c| 11 +- drivers/misc/cb710/core.c | 19 +- drivers/misc/tifm_core.c | 15 +- drivers/mmc/core/host.c | 13 +- drivers/mtd/mtdcore.c |4 +- drivers/net/macvtap.c |2 +- drivers/net/ppp/ppp_generic.c |4 +- drivers/power/bq2415x_charger.c |2 +- drivers/power/bq27x00_battery.c |2 +- drivers/power/ds2782_battery.c|2 +- drivers/pps/kapi.c|2 +- drivers/pps/pps.c |4 +- drivers/ptp/ptp_clock.c |4 +- drivers/remoteproc/remoteproc_core.c |8 +- drivers/rpmsg/virtio_rpmsg_bus.c |4 +- drivers/rtc/class.c |6 +- drivers/scsi/bfa/bfad_im.c|2 +- drivers/scsi/ch.c | 14 +- drivers/scsi/lpfc/lpfc_init.c |2 +-
[PATCH 03/10] idr: Rewrite ida
This is a new, from scratch implementation of ida that should be simpler, faster and more space efficient. Two primary reasons for the rewrite: * A future patch will reimplement idr on top of this ida implementation + radix trees. Once that's done, the end result will be ~1k fewer lines of code, much simpler and easier to understand and it should be quite a bit faster. * The performance improvements and addition of ganged allocation should make ida more suitable for use by a percpu id/tag allocator, which would then act as a frontend to this allocator. The old ida implementation was done with the idr data structures - this was IMO backwards. I'll soon be reimplementing idr on top of this new ida implementation and radix trees - using a separate dedicated data structure for the free ID bitmap should actually make idr faster, and the end result is _significantly_ less code. This implementation conceptually isn't that different from the old one - it's a tree of bitmaps, where one bit in a given node indicates whether or not there are free bits in a child node. The main difference (and advantage) over the old version is that the tree isn't implemented with pointers - it's implemented in an array, like how heaps are implemented, which gives both better space efficiency and it'll be faster since there's no pointer chasing. This does mean that the entire bitmap is stored in one contiguous memory allocation - and as currently implemented we won't be able to allocate _quite_ as many ids as with the previous implementation. I don't expect this to be an issue in practice since anywhere this is used, an id corresponds to a struct allocation somewhere else - we can't allocate an unbounded number of ids, we'll run out of memory somewhere else eventually, and I expect that to be the limiting factor in practice. 
If a user/use case does come up where this matters I can add some sharding (or perhaps add a separate big_ida implementation) - but the extra complexity would adversely affect performance for the users that don't need millions of ids, so I intend to leave the implementation as is until if and when this becomes an issue. Signed-off-by: Kent Overstreet koverstr...@google.com Cc: Andrew Morton a...@linux-foundation.org Cc: Tejun Heo t...@kernel.org Cc: Stephen Rothwell s...@canb.auug.org.au Cc: Fengguang Wu fengguang...@intel.com --- include/linux/idr.h | 118 +--- lib/idr.c | 776 +--- 2 files changed, 573 insertions(+), 321 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index c0e0c54..a78cf99 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -17,6 +17,88 @@ #include linux/init.h #include linux/rcupdate.h +/* IDA */ + +#define IDA_INLINE_NODES 1 + +struct ida { + spinlock_t lock; + + /* +* cur_id and allocated_ids are for ida_alloc_cyclic. For cyclic +* allocations we search for new ids to allocate starting from the last +* id allocated - cur_id is the next id to try allocating. +* +* But we also don't want the allocated ids to be arbitrarily sparse - +* the memory usage for the bitmap could be arbitrarily bad, and if +* they're used as keys in a radix tree the memory overhead of the radix +* tree could be quite bad as well. So we use allocated_ids to decide +* when to restart cur_id from 0, and bound how sparse the bitmap can +* be. 
+*/ + unsignedcur_id; + unsignedallocated_ids; + + /* size of ida-tree */ + unsignednodes; + + /* +* Index of first leaf node in ida-tree; equal to the number of non +* leaf nodes, ida-nodes - ida-first_leaf == number of leaf nodes +*/ + unsignedfirst_leaf; + + unsigned long *tree; + unsigned long inline_nodes[IDA_INLINE_NODES]; +}; + +#define IDA_INIT(name) \ +{ \ + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .nodes = IDA_INLINE_NODES, \ + .first_leaf = 1,\ + .tree = name.inline_nodes,\ +} +#define DEFINE_IDA(name) struct ida name = IDA_INIT(name) + +void ida_remove(struct ida *ida, unsigned id); +int ida_alloc_range(struct ida *ida, unsigned int start, + unsigned int end, gfp_t gfp); +int ida_alloc_cyclic(struct ida *ida, unsigned start, unsigned end, gfp_t gfp); +void ida_destroy(struct ida *ida); +int ida_init_prealloc(struct ida *ida, unsigned prealloc); + +/** + * ida_alloc_range - allocate a new id. + * @ida: the (initialized) ida. + * @gfp_mask: memory allocation flags + * + * Allocates an id in the range [0, INT_MAX]. Returns
[PATCH 06/10] idr: Rename idr_get_next() - idr_find_next()
get() implies taking a ref or sometimes an allocation, which this function definitely does not do - rename it to something more sensible. Signed-off-by: Kent Overstreet koverstr...@google.com Cc: Andrew Morton a...@linux-foundation.org Cc: Tejun Heo t...@kernel.org --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_nl.c | 2 +- drivers/mtd/mtdcore.c | 2 +- include/linux/idr.h| 4 ++-- lib/idr.c | 6 +++--- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a5dca6a..b84e4b2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -500,7 +500,7 @@ int conn_lowest_minor(struct drbd_tconn *tconn) int vnr = 0, m; rcu_read_lock(); - mdev = idr_get_next(tconn-volumes, vnr); + mdev = idr_find_next(tconn-volumes, vnr); m = mdev ? mdev_to_minor(mdev) : -1; rcu_read_unlock(); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9e3f441..8fa7eb1 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2837,7 +2837,7 @@ int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) } if (tconn) { next_tconn: - mdev = idr_get_next(tconn-volumes, volume); + mdev = idr_find_next(tconn-volumes, volume); if (!mdev) { /* No more volumes to dump on this tconn. * Advance tconn iterator. 
*/ diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index c400c57..eaa1fcc 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(mtd_table_mutex); struct mtd_info *__mtd_next_device(int i) { - return idr_get_next(mtd_idr, i); + return idr_find_next(mtd_idr, i); } EXPORT_SYMBOL_GPL(__mtd_next_device); diff --git a/include/linux/idr.h b/include/linux/idr.h index 35df6c7..ae3a3b4 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -209,7 +209,7 @@ int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask); int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); -void *idr_get_next(struct idr *idp, int *nextid); +void *idr_find_next(struct idr *idp, int *nextid); void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_free(struct idr *idp, int id); @@ -260,7 +260,7 @@ static inline void *idr_find(struct idr *idr, int id) * is convenient for a not found value. */ #define idr_for_each_entry(idp, entry, id) \ - for (id = 0; ((entry) = idr_get_next(idp, (id))) != NULL; ++id) + for (id = 0; ((entry) = idr_find_next(idp, (id))) != NULL; ++id) void __init idr_init_cache(void); diff --git a/lib/idr.c b/lib/idr.c index dae0920..21f86e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -1506,7 +1506,7 @@ int idr_for_each(struct idr *idp, EXPORT_SYMBOL(idr_for_each); /** - * idr_get_next - lookup next object of id to given id. + * idr_find_next - lookup next object of id to given id. * @idp: idr handle * @nextidp: pointer to lookup key * @@ -1517,7 +1517,7 @@ EXPORT_SYMBOL(idr_for_each); * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. 
*/ -void *idr_get_next(struct idr *idp, int *nextidp) +void *idr_find_next(struct idr *idp, int *nextidp) { struct idr_layer *p, *pa[MAX_IDR_LEVEL + 1]; struct idr_layer **paa = pa[0]; @@ -1558,7 +1558,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) } return NULL; } -EXPORT_SYMBOL(idr_get_next); +EXPORT_SYMBOL(idr_find_next); /** -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/10] idr: Kill old deprecated idr interfaces
The deprecated idr interfaces don't have any in kernel users, so let's delete them as prep work for the idr rewrite. Signed-off-by: Kent Overstreet koverstr...@google.com Cc: Andrew Morton a...@linux-foundation.org Cc: Tejun Heo t...@kernel.org --- include/linux/idr.h | 63 - lib/idr.c | 36 +++--- 2 files changed, 3 insertions(+), 96 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ba796b..35df6c7 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -262,69 +262,6 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, (id))) != NULL; ++id) -/* - * Don't use the following functions. These exist only to suppress - * deprecated warnings on EXPORT_SYMBOL()s. - */ -int __idr_pre_get(struct idr *idp, gfp_t gfp_mask); -int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); -void __idr_remove_all(struct idr *idp); - -/** - * idr_pre_get - reserve resources for idr allocation - * @idp: idr handle - * @gfp_mask: memory allocation flags - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. - */ -static inline int __deprecated idr_pre_get(struct idr *idp, gfp_t gfp_mask) -{ - return __idr_pre_get(idp, gfp_mask); -} - -/** - * idr_get_new_above - allocate new idr entry above or equal to a start id - * @idp: idr handle - * @ptr: pointer you want associated with the id - * @starting_id: id to start search at - * @id: pointer to the allocated handle - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. 
- */ -static inline int __deprecated idr_get_new_above(struct idr *idp, void *ptr, -int starting_id, int *id) -{ - return __idr_get_new_above(idp, ptr, starting_id, id); -} - -/** - * idr_get_new - allocate new idr entry - * @idp: idr handle - * @ptr: pointer you want associated with the id - * @id: pointer to the allocated handle - * - * Part of old alloc interface. This is going away. Use - * idr_preload[_end]() and idr_alloc() instead. - */ -static inline int __deprecated idr_get_new(struct idr *idp, void *ptr, int *id) -{ - return __idr_get_new_above(idp, ptr, 0, id); -} - -/** - * idr_remove_all - remove all ids from the given idr tree - * @idp: idr handle - * - * If you're trying to destroy @idp, calling idr_destroy() is enough. - * This is going away. Don't use. - */ -static inline void __deprecated idr_remove_all(struct idr *idp) -{ - __idr_remove_all(idp); -} - void __init idr_init_cache(void); #endif /* __IDR_H__ */ diff --git a/lib/idr.c b/lib/idr.c index 4962e5d..dae0920 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -982,19 +982,6 @@ static void idr_mark_full(struct idr_layer **pa, int id) } } -int __idr_pre_get(struct idr *idp, gfp_t gfp_mask) -{ - while (idp-id_free_cnt MAX_IDR_FREE) { - struct idr_layer *new; - new = kmem_cache_zalloc(idr_layer_cache, gfp_mask); - if (new == NULL) - return (0); - move_to_free_list(idp, new); - } - return 1; -} -EXPORT_SYMBOL(__idr_pre_get); - /** * sub_alloc - try to allocate an id without growing the tree depth * @idp: idr handle @@ -1160,21 +1147,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id, idr_mark_full(pa, id); } -int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id) -{ - struct idr_layer *pa[MAX_IDR_LEVEL + 1]; - int rv; - - rv = idr_get_empty_slot(idp, starting_id, pa, 0, idp); - if (rv 0) - return rv == -ENOMEM ? 
-EAGAIN : rv; - - idr_fill_slot(idp, ptr, rv, pa); - *id = rv; - return 0; -} -EXPORT_SYMBOL(__idr_get_new_above); - /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading @@ -1395,7 +1367,7 @@ void idr_remove(struct idr *idp, int id) } EXPORT_SYMBOL(idr_remove); -void __idr_remove_all(struct idr *idp) +static void __idr_remove_all(struct idr *idp) { int n, id, max; int bt_mask; @@ -1428,7 +1400,6 @@ void __idr_remove_all(struct idr *idp) } idp-layers = 0; } -EXPORT_SYMBOL(__idr_remove_all); /** * idr_destroy - release all cached layers within an idr tree @@ -1490,13 +1461,12 @@ EXPORT_SYMBOL(idr_find_slowpath); * callback function will be called for each pointer currently * registered, passing the id, the pointer and the data pointer passed * to this function. It is not safe to modify the idr tree while in - * the callback, so functions such as idr_get_new and idr_remove are - * not allowed. + * the callback, so functions such as idr_remove are not allowed. * * We check
[PATCH 08/10] idr: Reimplement idr on top of ida/radix trees
The old idr code was really a second radix tree implementation - we already have one in lib/radix-tree.c. This patch reimplements idr on top of our existing radix trees, using our shiny new ida implementation for allocating/freeing the ids. The old idr code was noticably slower than lib/radix-tree.c in at least some benchmarks, so in addition to being ~500 lines less code this patch should improve performance too. There's one thing left unfinished in this patch - the existing idr_preload() interface won't work for ida. Another patch on top of this will fix idr_preload() and update existing users to the new interface. Signed-off-by: Kent Overstreet koverstr...@google.com Cc: Andrew Morton a...@linux-foundation.org Cc: Tejun Heo t...@kernel.org --- include/linux/idr.h | 157 - init/main.c | 1 - lib/idr.c | 896 ++-- 3 files changed, 249 insertions(+), 805 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 4bce55c..ec789f5 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -1,6 +1,6 @@ /* * include/linux/idr.h - * + * * 2002-10-18 written by Jim Houston jim.hous...@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * Distributed under the GNU GPL license version 2. @@ -12,10 +12,8 @@ #ifndef __IDR_H__ #define __IDR_H__ -#include linux/types.h -#include linux/bitops.h -#include linux/init.h -#include linux/rcupdate.h +#include linux/gfp.h +#include linux/radix-tree.h #include linux/spinlock_types.h #include linux/wait.h @@ -147,74 +145,42 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); /* IDR */ -/* - * We want shallower trees and thus more bits covered at each layer. 8 - * bits gives us large enough first layer for most use cases and maximum - * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and - * 1k on 32bit. 
+/** + * DOC: idr sync + * idr synchronization (stolen from radix-tree.h) + * + * idr_alloc() and idr_remove() do their own locking internally - the user need + * not be concerned with synchronization unless there's other operations that + * need to be done atomically. + * + * idr_find() does no locking - it can be called locklessly using RCU, if the + * caller ensures calls to this function are made within rcu_read_lock() + * regions and does all the other appropriate RCU stuff. */ -#define IDR_BITS 8 -#define IDR_SIZE (1 IDR_BITS) -#define IDR_MASK ((1 IDR_BITS)-1) - -struct idr_layer { - int prefix; /* the ID prefix of this idr_layer */ - DECLARE_BITMAP(bitmap, IDR_SIZE); /* A zero bit means space here */ - struct idr_layer __rcu *ary[1IDR_BITS]; - int count; /* When zero, we can release it */ - int layer; /* distance from leaf */ - struct rcu_head rcu_head; -}; struct idr { - struct idr_layer __rcu *hint; /* the last layer allocated from */ - struct idr_layer __rcu *top; - struct idr_layer*id_free; - int layers; /* only valid w/o concurrent changes */ - int id_free_cnt; - int cur;/* current pos for cyclic allocation */ - spinlock_t lock; + struct ida ida; + struct radix_tree_root ptrs; }; #define IDR_INIT(name) \ { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .ida= IDA_INIT(name.ida), \ + .ptrs = RADIX_TREE_INIT(GFP_NOWAIT), \ } #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) -/** - * DOC: idr sync - * idr synchronization (stolen from radix-tree.h) - * - * idr_find() is able to be called locklessly, using RCU. The caller must - * ensure calls to this function are made within rcu_read_lock() regions. - * Other readers (lock-free or otherwise) and modifications may be running - * concurrently. - * - * It is still required that the caller manage the synchronization and - * lifetimes of the items. 
So if RCU lock-free lookups are used, typically - * this would mean that the items have their own locks, or are amenable to - * lock-free access; and that the items are freed by RCU (or only freed after - * having been deleted from the idr tree *and* a synchronize_rcu() grace - * period). - */ - -/* - * This is what we export. - */ - -void *idr_find_slowpath(struct idr *idp, int id); -void idr_preload(gfp_t gfp_mask); -int idr_alloc_range(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask); -int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask); -int idr_for_each(struct idr *idp, +void *idr_find_next(struct idr *idr, int *nextid); +int
[PATCH 04/10] idr: Percpu ida
Percpu frontend for allocating ids. With percpu allocation (that works), it's impossible to guarantee it will always be possible to allocate all nr_tags - typically, some will be stuck on a remote percpu freelist where the current job can't get to them. We do guarantee that it will always be possible to allocate at least (nr_tags / 2) tags - this is done by keeping track of which and how many cpus have tags on their percpu freelists. On allocation failure if enough cpus have tags that there could potentially be (nr_tags / 2) tags stuck on remote percpu freelists, we then pick a remote cpu at random to steal from. Note that the synchronization is _definitely_ tricky - we're using xchg()/cmpxchg() on the percpu lists, to synchronize between steal_tags(). The alternative would've been adding a spinlock to protect the percpu freelists, but that would've required some different tricky code to avoid deadlock because of the lock ordering. Note that there's no cpu hotplug notifier - we don't care, because steal_tags() will eventually get the down cpu's tags. We _could_ satisfy more allocations if we had a notifier - but we'll still meet our guarantees and it's absolutely not a correctness issue, so I don't think it's worth the extra code. Signed-off-by: Kent Overstreet koverstr...@google.com Cc: Tejun Heo t...@kernel.org Cc: Oleg Nesterov o...@redhat.com Cc: Christoph Lameter c...@linux-foundation.org Cc: Ingo Molnar mi...@redhat.com Cc: Andi Kleen a...@firstfloor.org Cc: Jens Axboe ax...@kernel.dk Cc: Nicholas A. 
Bellinger n...@linux-iscsi.org --- include/linux/idr.h | 48 lib/idr.c | 312 ++-- 2 files changed, 352 insertions(+), 8 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index a78cf99..3ba796b 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -16,6 +16,8 @@ #include linux/bitops.h #include linux/init.h #include linux/rcupdate.h +#include linux/spinlock_types.h +#include linux/wait.h /* IDA */ @@ -97,6 +99,52 @@ static inline void ida_init(struct ida *ida) ida_init_prealloc(ida, 0); } +/* Percpu IDA/tag allocator */ + +struct percpu_ida_cpu; + +struct percpu_ida { + /* +* number of tags available to be allocated, as passed to +* percpu_ida_init() +*/ + unsignednr_tags; + + struct percpu_ida_cpu __percpu *tag_cpu; + + /* +* Bitmap of cpus that (may) have tags on their percpu freelists: +* steal_tags() uses this to decide when to steal tags, and which cpus +* to try stealing from. +* +* It's ok for a freelist to be empty when its bit is set - steal_tags() +* will just keep looking - but the bitmap _must_ be set whenever a +* percpu freelist does have tags. +*/ + unsigned long *cpus_have_tags; + + struct { + /* +* When we go to steal tags from another cpu (see steal_tags()), +* we want to pick a cpu at random. Cycling through them every +* time we steal is a bit easier and more or less equivalent: +*/ + unsignedcpu_last_stolen; + + /* For sleeping on allocation failure */ + wait_queue_head_t wait; + + /* Global freelist */ + struct ida ida; + } cacheline_aligned_in_smp; +}; + +int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp); +void percpu_ida_free(struct percpu_ida *pool, unsigned tag); + +void percpu_ida_destroy(struct percpu_ida *pool); +int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags); + /* IDR */ /* diff --git a/lib/idr.c b/lib/idr.c index e6b837a..4962e5d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -28,17 +28,20 @@ * with the slab allocator. */ -#ifndef TEST// to test in user space... 
-#include linux/slab.h -#include linux/init.h -#include linux/export.h -#endif +#include linux/bitmap.h +#include linux/bitops.h +#include linux/bug.h #include linux/err.h -#include linux/string.h +#include linux/export.h +#include linux/hardirq.h #include linux/idr.h -#include linux/spinlock.h +#include linux/init.h +#include linux/kernel.h #include linux/percpu.h -#include linux/hardirq.h +#include linux/sched.h +#include linux/slab.h +#include linux/string.h +#include linux/spinlock.h #define IDA_TREE_ARY BITS_PER_LONG @@ -527,6 +530,299 @@ int ida_init_prealloc(struct ida *ida, unsigned prealloc) } EXPORT_SYMBOL(ida_init_prealloc); +/* Percpu IDA */ + +/* + * Number of tags we move between the percpu freelist and the global freelist at + * a time + */ +#define TAG_CPU_BATCH_MOVE 32U + +/* Max size of percpu freelist, */ +#define TAG_CPU_SIZE (TAG_CPU_BATCH_MOVE + TAG_CPU_BATCH_MOVE / 2) + +struct percpu_ida_cpu { + spinlock_t
Re: [PATCH] Percpu tag allocator
On Thu, Jun 13, 2013 at 12:23:39PM -0700, Andrew Morton wrote: On Thu, 13 Jun 2013 12:15:07 -0700 Tejun Heo t...@kernel.org wrote: Oh, I'm sure the current id[r|a] can be improved upon a lot but I'm very skeptical one can reach the level of scalability necessary for, say, pci-e attached extremely high-iops devices while still keeping the lowest number allocation, which can't be achieved without strong synchronization on each alloc/free. Maybe we can layer things so that we have percpu layer on top of id[r|a] and, say, mapping id to point is still done by idr, or the percpu tag allocator uses ida for tag chunk allocations, but it's still gonna be something extra on top. It's not obvious that explicit per-cpu is needed. Get an ID from ida_get_new_above(), multiply it by 16 and store that in device-local storage, along with a 16-bit bitmap. Blam, 30 lines of code and the ida_get_new_above() cost is reduced 16x and it's off the map. I was just rereading emails and realized I should've replied to this. So, I started out aiming for something simpler. I don't quite follow what the approach you're suggesting is, but it ends up you really do need the percpu freelists (buffering batching/freeing from the main freelist) because ids/tags may not be freed on the cpu they were allocated on. In particular, if this is for a driver and the device doesn't implement per cpu queues, tags are almost always going to be freed on a different cpu. If you just give each cpu a fraction of the tag space they always allocate out of (with ida_get_new_above() or similar) - that only helps allocation, half the cacheline contention is freeing. I originally wrote this tag allocator to use in a driver for a device that didn't support multiple hardware queues at all, but it was fast enough that any cacheline bouncing really hurt. 
So that right there gets you to the basic design where you've got a global freelist and percpu freelists, and you just use the percpu freelists to batch up allocation/freeing to the global freelist. The tag allocator I was going to submit for the aio code was pretty much just that, nothing more. It was simple. It worked. I was happy with it. The one concern with this approach is what happens when all the percpu freelists except yours are full of free tags. Originally, I had an easy solution - we calculate the size of the percpu freelists based on nr_tags and num_possible_cpus, so that there can't be more than half of the tag space stranded like this. (Keep in mind the main use case is drivers where the tag/id is used to talk to the device, so you're limited by whatever the hardware people thought was enough - 16 bits if you're lucky). But then Tejun went and pointed out, just as I was about to mail it off - Hey, what happens if you've got 4096 cpus and not all that many tags? You've got a nice divide by zero in there. After which I proceeded to swear at him a bit, but - well, it's a real problem. And that is what led to the tag stealing stuff and all the cmpxchg() shenanigans. And I'm pretty happy with the solution - there's an elegance to it and I bet if I cared I could come up with a proof that it's more or less optimal w.r.t. cacheline bounces for some decent fraction of workloads we might care about. But yeah, it's not as simple as I would've liked. Anyways, now you've got an ida/idr api cleanup/rewrite to go along with it, and it's all nicely integrated. Integrating the percpu tag allocator with regular ida really doesn't save us any code - the original global freelist was a stack and like 10 lines of code total. But having the apis be consistent and having it all be organized and pretty is nice. 
I think it is now, anyways :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 03/10] idr: Rewrite ida
On Wed, Jun 19, 2013 at 10:40:22AM +0100, Steven Whitehouse wrote: Hi, On Tue, 2013-06-18 at 17:02 -0700, Kent Overstreet wrote: This is a new, from scratch implementation of ida that should be simpler, faster and more space efficient. [...] This does mean that the entire bitmap is stored in one contiguous memory allocation - and as currently implemented we won't be able to allocate _quite_ as many ids as with the previous implementation. I don't expect this to be an issue in practice since anywhere this is used, an id corresponds to a struct allocation somewher else - we can't allocate an unbounded number of ids, we'll run out of memory somewhere else eventually, and I expect that to be the limiting factor in practice. If a user/use case does come up where this matters I can add some sharding (or perhaps add a separate big_ida implementation) - but the extra complexity would adversely affect performance for the users that don't need millions of ids, so I intend to leave the implementation as is until if and when this becomes an issue. Millions of IDs is something that is fairly normal for DLM, since there will be two DLM locks per cached inode with GFS2 and people tend to use it on pretty large servers with lots of memory, Thanks, I wasn't aware of that. Is the 31 bits for the id limitation an issue for you? While I'm at it, changing ids to longs should be fairly trivial. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] Bcache fixes for 3.11
Hey Jens - I've been busy torture testing and chasing bugs, here's the fruits of my labors. These are all fairly small fixes, some of them quite important: The following changes since commit 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7: bcache: Use standard utility code (2013-07-01 14:43:53 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11 for you to fetch changes up to 79826c35eb99cd3c0873b8396f45fa26c87fb0b0: bcache: Allocation kthread fixes (2013-07-12 00:22:49 -0700) Dan Carpenter (1): bcache: check for allocation failures Kent Overstreet (7): bcache: Fix a dumb race bcache: Advertise that flushes are supported bcache: Fix a sysfs splat on shutdown bcache: Shutdown fix bcache: Journal replay fix bcache: Fix GC_SECTORS_USED() calculation bcache: Allocation kthread fixes drivers/md/bcache/alloc.c | 18 -- drivers/md/bcache/bcache.h | 5 + drivers/md/bcache/btree.c | 4 +++- drivers/md/bcache/closure.c | 6 -- drivers/md/bcache/journal.c | 7 ++- drivers/md/bcache/request.c | 8 +++- drivers/md/bcache/super.c | 42 ++ drivers/md/bcache/sysfs.c | 2 ++ 8 files changed, 61 insertions(+), 31 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] Bcache changes for 3.12
Jens, here's the bcache stuff for 3.12. This is on top of the pull request I sent you of fixes for 3.11. Pull away! The following changes since commit bef0ef06a9fe34b08177b67d3213dceab29d3abe: bcache: Fix for handling overlapping extents when reading in a btree node (2013-09-03 13:42:35 -0700) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.12 for you to fetch changes up to 2879736e8a022780e28da9e4502815b510fcfe8b: bcache: Bypass torture test (2013-09-10 19:08:32 -0700) Kent Overstreet (49): bcache: Use blkdev_issue_discard() bcache: Add on error panic/unregister setting bcache: Stripe size isn't necessarily a power of two bcache: Remove unnecessary check in should_split() bcache: Explicitly track btree node's parent bcache: Add btree_insert_node() bcache: Insert multiple keys at a time bcache: Convert btree_insert_check_key() to btree_insert_node() bcache: Add explicit keylist arg to btree_insert() bcache: Clean up keylist code bcache: Refactor request_write() bcache: Refactor read request code a bit bcache: Refactor journalling flow control bcache: Move keylist out of btree_op bcache: Convert try_wait to wait_queue_head_t bcache: Convert bucket_wait to wait_queue_head_t bcache: Convert gc to a kthread bcache: Convert writeback to a kthread bcache: Add btree_map() functions bcache: Move some stuff to btree.c bcache: Convert bch_btree_read_async() to bch_btree_map_keys() bcache: Clean up cache_lookup_fn bcache: Prune struct btree_op bcache: Kill op-cl bcache: Drop some closure stuff bcache: Kill op-replace bcache: Don't use op-insert_collision bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes() bcache: Break up struct search bcache: Move sector allocator to alloc.c bcache: Pull on disk data structures out into a separate header bcache: Fix bch_ptr_bad() bcache: Debug code improvements bcache: Don't bother with bucket refcount for btree node allocations bcache: bch_(btree|extent)_ptr_invalid() bcache: 
PRECEDING_KEY() bcache: Add btree_node_write_sync() bcache: Add make_btree_freeing_key() bcache: Incremental gc bcache: Avoid deadlocking in garbage collection bcache: Kill bch_next_recurse_key() bcache: Kill sequential_merge option bcache: Move spinlock into struct time_stats bcache: Have btree_split() insert into parent directly bcache: Better full stripe scanning bcache: Fix sysfs splat on shutdown with flash only devs bcache: Use ida for bcache block dev minor bcache: Delete some slower inline asm bcache: Bypass torture test drivers/md/bcache/Kconfig | 11 +- drivers/md/bcache/alloc.c | 383 +++- drivers/md/bcache/bcache.h| 327 +- drivers/md/bcache/bset.c | 289 - drivers/md/bcache/bset.h | 93 +-- drivers/md/bcache/btree.c | 1385 ++--- drivers/md/bcache/btree.h | 195 ++ drivers/md/bcache/closure.c | 25 +- drivers/md/bcache/closure.h | 110 +--- drivers/md/bcache/debug.c | 170 +++-- drivers/md/bcache/debug.h | 50 +- drivers/md/bcache/journal.c | 290 - drivers/md/bcache/journal.h | 49 +- drivers/md/bcache/movinggc.c | 85 ++- drivers/md/bcache/request.c | 1106 drivers/md/bcache/request.h | 43 +- drivers/md/bcache/stats.c | 26 +- drivers/md/bcache/stats.h | 13 +- drivers/md/bcache/super.c | 188 +++--- drivers/md/bcache/sysfs.c | 42 +- drivers/md/bcache/trace.c |1 - drivers/md/bcache/util.c | 12 +- drivers/md/bcache/util.h | 15 +- drivers/md/bcache/writeback.c | 455 +++--- drivers/md/bcache/writeback.h | 47 +- include/trace/events/bcache.h | 47 +- include/uapi/linux/bcache.h | 373 +++ 27 files changed, 2966 insertions(+), 2864 deletions(-) create mode 100644 include/uapi/linux/bcache.h -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] bcache changes for 3.13
Hey Jens, sorry for being late with this - anyways, it's roughly the same set of patches you had queued up before plus a few minor fixes, but it's been rebased onto your for-3.13/core branch. The following changes since commit febca1baea1cfe2d7a0271385d89b03d5fb34f94: block: setup bi_vcnt on clones (2013-10-31 13:32:42 -0600) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.13 for you to fetch changes up to 0bb953dba5d295b03bb035bfc2d28ed4155df377: bcache: defensively handle format strings (2013-11-01 18:03:20 -0700) Kees Cook (1): bcache: defensively handle format strings Kent Overstreet (53): bcache: Fixed incorrect order of arguments to bio_alloc_bioset() bcache: Convert bch_data_verify() to immutable biovecs bcache: Fix a journalling performance bug bcache: Fix a lockdep splat bcache: Use blkdev_issue_discard() bcache: Add on error panic/unregister setting bcache: Stripe size isn't necessarily a power of two bcache: Remove unnecessary check in should_split() bcache: Explicitly track btree node's parent bcache: Add btree_insert_node() bcache: Insert multiple keys at a time bcache: Convert btree_insert_check_key() to btree_insert_node() bcache: Add explicit keylist arg to btree_insert() bcache: Clean up keylist code bcache: Refactor request_write() bcache: Refactor read request code a bit bcache: Refactor journalling flow control bcache: Move keylist out of btree_op bcache: Convert try_wait to wait_queue_head_t bcache: Convert bucket_wait to wait_queue_head_t bcache: Convert gc to a kthread bcache: Convert writeback to a kthread bcache: Add btree_map() functions bcache: Move some stuff to btree.c bcache: Convert bch_btree_read_async() to bch_btree_map_keys() bcache: Clean up cache_lookup_fn bcache: Prune struct btree_op bcache: Kill op-cl bcache: Drop some closure stuff bcache: Kill op-replace bcache: Don't use op-insert_collision bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes() bcache: Break up 
struct search bcache: Move sector allocator to alloc.c bcache: Pull on disk data structures out into a separate header bcache: Fix bch_ptr_bad() bcache: Debug code improvements bcache: Don't bother with bucket refcount for btree node allocations bcache: bch_(btree|extent)_ptr_invalid() bcache: PRECEDING_KEY() bcache: Add btree_node_write_sync() bcache: Add make_btree_freeing_key() bcache: Incremental gc bcache: Avoid deadlocking in garbage collection bcache: Kill bch_next_recurse_key() bcache: Kill sequential_merge option bcache: Move spinlock into struct time_stats bcache: Have btree_split() insert into parent directly bcache: Better full stripe scanning bcache: Fix sysfs splat on shutdown with flash only devs bcache: Use ida for bcache block dev minor bcache: Delete some slower inline asm bcache: Bypass torture test drivers/md/bcache/Kconfig | 11 +- drivers/md/bcache/alloc.c | 383 +++- drivers/md/bcache/bcache.h| 327 +- drivers/md/bcache/bset.c | 289 - drivers/md/bcache/bset.h | 93 +-- drivers/md/bcache/btree.c | 1386 ++--- drivers/md/bcache/btree.h | 195 ++ drivers/md/bcache/closure.c | 103 +-- drivers/md/bcache/closure.h | 183 +- drivers/md/bcache/debug.c | 202 +++--- drivers/md/bcache/debug.h | 50 +- drivers/md/bcache/journal.c | 293 + drivers/md/bcache/journal.h | 52 +- drivers/md/bcache/movinggc.c | 85 ++- drivers/md/bcache/request.c | 1106 drivers/md/bcache/request.h | 42 +- drivers/md/bcache/stats.c | 26 +- drivers/md/bcache/stats.h | 13 +- drivers/md/bcache/super.c | 190 +++--- drivers/md/bcache/sysfs.c | 42 +- drivers/md/bcache/trace.c |1 - drivers/md/bcache/util.c | 12 +- drivers/md/bcache/util.h | 15 +- drivers/md/bcache/writeback.c | 455 +++--- drivers/md/bcache/writeback.h | 46 +- include/trace/events/bcache.h | 47 +- include/uapi/linux/bcache.h | 373 +++ 27 files changed, 3006 insertions(+), 3014 deletions(-) create mode 100644 include/uapi/linux/bcache.h -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a 
message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH for-next] dm: fix missing bi_remaining accounting
On Mon, Nov 04, 2013 at 10:06:00AM -0500, Mikulas Patocka wrote: On Fri, 1 Nov 2013, Jens Axboe wrote: On 11/01/2013 07:59 AM, Mike Snitzer wrote: Add the missing bi_remaining increment, required by the block layer's new bio-chaining code, to both the verity and old snapshot DM targets. Otherwise users will hit the bi_remaining = 0 BUG_ON in bio_endio(). Thanks Mike, added to the mix. -- Jens Axboe Hi This improves a little bit on the previous patch, by replacing costly atomic_inc with cheap atomic_set. IMO, this is a bad idea; the behaviour with this patch does _not_ match the naming of bio_endio_nodec(), and the performance difference should be well in the noise anyways because we're touching a cacheline we already have in cache and won't be contended. The fact that it's currently safe is accidental, I could see this easily tripping people up and being a pain in the ass to debug in the future. From: Mikulas Patocka mpato...@redhat.com dm: change atomic_inc to atomic_set(1) There are places in dm where we save bi_endio and bi_private, set them to target's routine, submit the bio, from the target's bi_endio routine we restore bi_endio and bi_private and end the bio with bi_endio. This causes underflow of bi_remaining, so we must restore bi_remaining before ending the bio from the target bi_endio routine. The code uses atomic_inc for restoration of bi_remaining. This patch changes it to atomic_set(1) to avoid an interlocked instruction. In the target's bi_endio routine we are sure that bi_remaining is zero (otherwise, the bi_endio routine wouldn't be called) and there are no concurrent users of the bio, so we can replace atomic_inc with atomic_set(1). -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [GIT PULL] bcache changes for 3.13
On Mon, Nov 04, 2013 at 01:12:17PM -0700, Jens Axboe wrote: On 11/04/2013 01:07 PM, Kent Overstreet wrote: Hey Jens, sorry for being late with this - anyways, it's roughly the same set of patches you had queued up before plus a few minor fixes, but it's been rebased onto your for-3.13/core branch. Merge window is a little later this time, and since it was already in for the previous release, we can make it happen. But can you please base it on for-3.13/drivers however? That's where I'd like to pull in the driver bits, and if I pull this one, then things get even more tangled since for-3.13/core is holding the blk-mq changes too. I'd like to do the merge with immutable biovecs myself though - there's some tricky bits in there. How will that work? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Block layer stuff/DIO rewrite prep for 3.14
Now that immutable biovecs is in, these are the remaining patches required for my DIO rewrite, along with some related cleanup/refactoring. The key enabler is patch 4 - making generic_make_request() handle arbitrary sized bios. This takes what was once bio_add_page()'s responsibility and pushes it down; long term plan is to hopefully push this down to the driver level, which should simplify a lot of code (a lot of it very fragile code!) and improve performance at the same time. The DIO rewrite needs some more work before it'll be ready, but I wanted to get this patch series out because this stuff should all be ready and it's useful for other reasons - I think this stuff will make it a lot easier for the btrfs people to do what they need with their DIO code. Other stuff enabled by this: * With this and the bio_split() rewrite already in, we can just delete merge_bvec_fn. I have patches for this, but they'll need more testing. * Multipage bvecs - bvecs pointing to an arbitrary amount of contiguous physical memory. I have this working but it'll need more testing and code auditing - this is what lets us kill bi_seg_front_size and bi_seg_back_size though. Patch series is based on Jens' for-next tree, and it's available in my git repository - git://evilpiepirate.org/~kent/linux-bcache.git for-jens -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/9] block: submit_bio_wait() conversions
It was being open coded in a few places. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Joern Engel jo...@logfs.org Cc: Prasad Joshi prasadjoshi.li...@gmail.com --- block/blk-flush.c | 19 +-- fs/logfs/dev_bdev.c | 8 +--- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 5580b05..9288aaf 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, bio-bi_flags); - if (bio-bi_private) - complete(bio-bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio-bi_end_io = bio_end_flush; bio-bi_bdev = bdev; - bio-bi_private = wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in -bi_sector, if @@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio-bi_iter.bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index ca42715..80adce7 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -23,7 +23,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) { struct bio bio; struct bio_vec bio_vec; - struct completion complete; bio_init(bio); bio.bi_max_vecs = 1; @@ -35,13 
+34,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_iter.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_iter.bi_sector = page-index * (PAGE_SIZE 9); - init_completion(complete); - bio.bi_private = complete; - bio.bi_end_io = request_complete; - submit_bio(rw, bio); - wait_for_completion(complete); - return test_bit(BIO_UPTODATE, bio.bi_flags) ? 0 : -EIO; + return submit_bio_wait(rw, bio); } static int bdev_readpage(void *_sb, struct page *page) -- 1.8.4.rc3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 9/9] block: Add bio_get_user_pages()
This replaces some of the code that was in __bio_map_user_iov(), and soon we're going to use this helper in the dio code. Note that this relies on the recent change to make generic_make_request() take arbitrary sized bios - we're not using bio_add_page() here. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- fs/bio.c| 124 +++- include/linux/bio.h | 2 + 2 files changed, 67 insertions(+), 59 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index c60bfcb..a10b350 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1124,17 +1124,70 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, } EXPORT_SYMBOL(bio_copy_user); +/** + * bio_get_user_pages - pin user pages and add them to a biovec + * @bio: bio to add pages to + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * + * Pins pages for up to @len bytes and appends them to @bio's bvec array. May + * pin only part of the requested pages - @bio need not have room for all the + * pages and can already have had pages added to it. + * + * Returns the number of bytes from @len added to @bio. 
+ */ +ssize_t bio_get_user_pages(struct bio *bio, unsigned long uaddr, + unsigned long len, int write_to_vm) +{ + int ret; + unsigned nr_pages, bytes; + unsigned offset = offset_in_page(uaddr); + struct bio_vec *bv; + struct page **pages; + + nr_pages = min_t(size_t, +DIV_ROUND_UP(len + offset, PAGE_SIZE), +bio-bi_max_vecs - bio-bi_vcnt); + + bv = bio-bi_io_vec[bio-bi_vcnt]; + pages = (void *) bv; + + ret = get_user_pages_fast(uaddr, nr_pages, write_to_vm, pages); + if (ret 0) + return ret; + + bio-bi_vcnt += ret; + bytes = ret * PAGE_SIZE - offset; + + while (ret--) { + bv[ret].bv_page = pages[ret]; + bv[ret].bv_len = PAGE_SIZE; + bv[ret].bv_offset = 0; + } + + bv[0].bv_offset += offset; + bv[0].bv_len -= offset; + + if (bytes len) { + bio-bi_io_vec[bio-bi_vcnt - 1].bv_len -= bytes - len; + bytes = len; + } + + bio-bi_iter.bi_size += bytes; + + return bytes; +} +EXPORT_SYMBOL(bio_get_user_pages); + static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { - int i, j; - int nr_pages = 0; - struct page **pages; + ssize_t ret; + int i, nr_pages = 0; struct bio *bio; - int cur_page = 0; - int ret, offset; for (i = 0; i iov_count; i++) { unsigned long uaddr = (unsigned long)iov[i].iov_base; @@ -1163,57 +1216,17 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!bio) return ERR_PTR(-ENOMEM); - ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); - if (!pages) - goto out; - for (i = 0; i iov_count; i++) { - unsigned long uaddr = (unsigned long)iov[i].iov_base; - unsigned long len = iov[i].iov_len; - unsigned long end = (uaddr + len + PAGE_SIZE - 1) PAGE_SHIFT; - unsigned long start = uaddr PAGE_SHIFT; - const int local_nr_pages = end - start; - const int page_limit = cur_page + local_nr_pages; - - ret = get_user_pages_fast(uaddr, local_nr_pages, - write_to_vm, pages[cur_page]); - if (ret local_nr_pages) { - ret = -EFAULT; - 
goto out_unmap; - } - - offset = uaddr ~PAGE_MASK; - for (j = cur_page; j page_limit; j++) { - unsigned int bytes = PAGE_SIZE - offset; + ret = bio_get_user_pages(bio, (size_t) iov[i].iov_base, +iov[i].iov_len, +write_to_vm); + if (ret 0) + goto out; - if (len = 0) - break; - - if (bytes len) - bytes = len; - - /* -* sorry... -*/ - if (bio_add_pc_page(q, bio, pages[j], bytes, offset) - bytes) - break; - - len -= bytes; - offset = 0; - } - - cur_page = j
[PATCH 8/9] bcache: generic_make_request() handles large bios now
So we get to delete our hacky workaround. Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/md/bcache/bcache.h| 18 drivers/md/bcache/io.c| 100 +- drivers/md/bcache/journal.c | 4 +- drivers/md/bcache/request.c | 16 +++ drivers/md/bcache/super.c | 33 ++ drivers/md/bcache/util.h | 5 ++- drivers/md/bcache/writeback.c | 4 +- include/linux/bio.h | 12 - 8 files changed, 19 insertions(+), 173 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 964353c..8f65331 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -241,19 +241,6 @@ struct keybuf { DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; -struct bio_split_pool { - struct bio_set *bio_split; - mempool_t *bio_split_hook; -}; - -struct bio_split_hook { - struct closure cl; - struct bio_split_pool *p; - struct bio *bio; - bio_end_io_t*bi_end_io; - void*bi_private; -}; - struct bcache_device { struct closure cl; @@ -286,8 +273,6 @@ struct bcache_device { int (*cache_miss)(struct btree *, struct search *, struct bio *, unsigned); int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); - - struct bio_split_pool bio_split_hook; }; struct io { @@ -465,8 +450,6 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; - - struct bio_split_pool bio_split_hook; }; struct gc_stat { @@ -901,7 +884,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); -void bch_generic_make_request(struct bio *, struct bio_split_pool *); void __bch_submit_bbio(struct bio *, struct cache_set *); void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fa028fa..86a0bb8 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,104 +11,6 @@ #include 
linux/blkdev.h -static unsigned bch_bio_max_sectors(struct bio *bio) -{ - struct request_queue *q = bdev_get_queue(bio-bi_bdev); - struct bio_vec bv; - struct bvec_iter iter; - unsigned ret = 0, seg = 0; - - if (bio-bi_rw REQ_DISCARD) - return min(bio_sectors(bio), q-limits.max_discard_sectors); - - bio_for_each_segment(bv, bio, iter) { - struct bvec_merge_data bvm = { - .bi_bdev= bio-bi_bdev, - .bi_sector = bio-bi_iter.bi_sector, - .bi_size= ret 9, - .bi_rw = bio-bi_rw, - }; - - if (seg == min_t(unsigned, BIO_MAX_PAGES, -queue_max_segments(q))) - break; - - if (q-merge_bvec_fn - q-merge_bvec_fn(q, bvm, bv) (int) bv.bv_len) - break; - - seg++; - ret += bv.bv_len 9; - } - - ret = min(ret, queue_max_sectors(q)); - - WARN_ON(!ret); - ret = max_t(int, ret, bio_iovec(bio).bv_len 9); - - return ret; -} - -static void bch_bio_submit_split_done(struct closure *cl) -{ - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - s-bio-bi_end_io = s-bi_end_io; - s-bio-bi_private = s-bi_private; - bio_endio_nodec(s-bio, 0); - - closure_debug_destroy(s-cl); - mempool_free(s, s-p-bio_split_hook); -} - -static void bch_bio_submit_split_endio(struct bio *bio, int error) -{ - struct closure *cl = bio-bi_private; - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - if (error) - clear_bit(BIO_UPTODATE, s-bio-bi_flags); - - bio_put(bio); - closure_put(cl); -} - -void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) -{ - struct bio_split_hook *s; - struct bio *n; - - if (!bio_has_data(bio) !(bio-bi_rw REQ_DISCARD)) - goto submit; - - if (bio_sectors(bio) = bch_bio_max_sectors(bio)) - goto submit; - - s = mempool_alloc(p-bio_split_hook, GFP_NOIO); - closure_init(s-cl, NULL); - - s-bio = bio; - s-p= p; - s-bi_end_io= bio-bi_end_io; - s-bi_private = bio-bi_private; - bio_get(bio); - - do { - n = bio_next_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s-p-bio_split
[PATCH 7/9] blk-lib.c: generic_make_request() handles large bios now
generic_make_request() will now do for us what the code in blk-lib.c was doing manually, with the bio_batch stuff - we still need some looping in case we're trying to discard/zeroout more than around a gigabyte, but when we can submit that much at a time doing the submissions in parallel really shouldn't matter. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- block/blk-lib.c | 175 ++-- 1 file changed, 30 insertions(+), 145 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 2da76c9..368c36a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,23 +9,6 @@ #include blk.h -struct bio_batch { - atomic_tdone; - unsigned long flags; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio-bi_private; - - if (err (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, bb-flags); - if (atomic_dec_and_test(bb-done)) - complete(bb-wait); - bio_put(bio); -} - /** * blkdev_issue_discard - queue a discard * @bdev: blockdev to issue discard for @@ -40,15 +23,10 @@ static void bio_batch_end_io(struct bio *bio, int err) int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; - int alignment; - struct bio_batch bb; struct bio *bio; int ret = 0; - struct blk_plug plug; if (!q) return -ENXIO; @@ -56,78 +34,28 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!blk_queue_discard(q)) return -EOPNOTSUPP; - /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q-limits.discard_granularity 9, 1U); - alignment = (bdev_discard_alignment(bdev) 9) % granularity; - - /* -* Ensure that max_discard_sectors is of the proper -* granularity, so that requests stay aligned after a split. 
-*/ - max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; type |= REQ_SECURE; } - atomic_set(bb.done, 1); - bb.flags = 1 BIO_UPTODATE; - bb.wait = wait; - - blk_start_plug(plug); while (nr_sects) { - unsigned int req_sects; - sector_t end_sect, tmp; - bio = bio_alloc(gfp_mask, 1); - if (!bio) { - ret = -ENOMEM; - break; - } - - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); - - /* -* If splitting a request, and the next starting sector would be -* misaligned, stop the discard at the previous aligned sector. -*/ - end_sect = sector + req_sects; - tmp = end_sect; - if (req_sects nr_sects - sector_div(tmp, granularity) != alignment) { - end_sect = end_sect - alignment; - sector_div(end_sect, granularity); - end_sect = end_sect * granularity + alignment; - req_sects = end_sect - sector; - } + if (!bio) + return -ENOMEM; - bio-bi_iter.bi_sector = sector; - bio-bi_end_io = bio_batch_end_io; bio-bi_bdev = bdev; - bio-bi_private = bb; + bio-bi_iter.bi_sector = sector; + bio-bi_iter.bi_size = min_t(sector_t, nr_sects, 1 20) 9; - bio-bi_iter.bi_size = req_sects 9; - nr_sects -= req_sects; - sector = end_sect; + sector += bio_sectors(bio); + nr_sects -= bio_sectors(bio); - atomic_inc(bb.done); - submit_bio(type, bio); + ret = submit_bio_wait(type, bio); + if (ret) + break; } - blk_finish_plug(plug); - - /* Wait for bios in-flight */ - if (!atomic_dec_and_test(bb.done)) - wait_for_completion_io(wait); - - if (!test_bit(BIO_UPTODATE, bb.flags)) - ret = -EIO; return
[PATCH 6/9] mtip32xx: handle arbitrary size bios
We get a measurable performance increase by handling this in the driver when we're already looping over the biovec, instead of handling it separately in generic_make_request() (or bio_add_page() originally) Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/block/mtip32xx/mtip32xx.c | 46 +-- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index d4c669b..c5a7a96 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2648,24 +2648,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, } /* - * Release a command slot. - * - * @dd Pointer to the driver data structure. - * @tag Slot tag - * - * return value - * None - */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, - int unaligned) -{ - struct semaphore *sem = unaligned ? dd-port-cmd_slot_unal : - dd-port-cmd_slot; - release_slot(dd-port, tag); - up(sem); -} - -/* * Obtain a command slot and return its associated scatter list. * * @dd Pointer to the driver data structure. @@ -4016,21 +3998,22 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) sg = mtip_hw_get_scatterlist(dd, tag, unaligned); if (likely(sg != NULL)) { - if (unlikely((bio)-bi_vcnt MTIP_MAX_SG)) { - dev_warn(dd-pdev-dev, - Maximum number of SGL entries exceeded\n); - bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag, unaligned); - return; - } - /* Create the scatter list for this bio. */ bio_for_each_segment(bvec, bio, iter) { - sg_set_page(sg[nents], - bvec.bv_page, - bvec.bv_len, - bvec.bv_offset); - nents++; + if (unlikely(nents == MTIP_MAX_SG)) { + struct bio *split = bio_clone(bio, GFP_NOIO); + + split-bi_iter = iter; + bio-bi_iter.bi_size -= iter.bi_size; + bio_chain(split, bio); + generic_make_request(split); + break; + } + + sg_set_page(sg[nents++], + bvec.bv_page, + bvec.bv_len, + bvec.bv_offset); } /* Issue the read/write. 
*/ @@ -4145,6 +4128,7 @@ skip_create_disk: blk_queue_max_hw_sectors(dd-queue, 0x); blk_queue_max_segment_size(dd-queue, 0x40); blk_queue_io_min(dd-queue, 4096); + set_bit(QUEUE_FLAG_LARGEBIOS, dd-queue-queue_flags); /* * write back cache is not supported in the device. FUA depends on -- 1.8.4.rc3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/9] block: Convert various code to bio_for_each_segment()
With immutable biovecs we don't want code accessing bi_io_vec directly - the uses this patch changes weren't incorrect since they all own the bio, but it makes the code harder to audit for no good reason - also, this will help with multipage bvecs later. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Alexander Viro v...@zeniv.linux.org.uk Cc: Chris Mason chris.ma...@fusionio.com Cc: Jaegeuk Kim jaegeuk@samsung.com Cc: Joern Engel jo...@logfs.org Cc: Prasad Joshi prasadjoshi.li...@gmail.com Cc: Trond Myklebust trond.mykleb...@netapp.com --- fs/btrfs/compression.c | 10 -- fs/btrfs/disk-io.c | 11 --- fs/btrfs/extent_io.c | 37 ++--- fs/btrfs/inode.c | 15 ++- fs/f2fs/data.c | 13 + fs/f2fs/segment.c| 12 +--- fs/logfs/dev_bdev.c | 18 +++--- fs/mpage.c | 17 - fs/nfs/blocklayout/blocklayout.c | 34 +- 9 files changed, 66 insertions(+), 101 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 06ab821..52e7848 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -203,18 +203,16 @@ csum_failed: if (cb-errors) { bio_io_error(cb-orig_bio); } else { - int bio_index = 0; - struct bio_vec *bvec = cb-orig_bio-bi_io_vec; + int i; + struct bio_vec *bvec; /* * we have verified the checksum already, set page * checked so the end_io handlers know about it */ - while (bio_index cb-orig_bio-bi_vcnt) { + bio_for_each_segment_all(bvec, cb-orig_bio, i) SetPageChecked(bvec-bv_page); - bvec++; - bio_index++; - } + bio_endio(cb-orig_bio, 0); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62176ad..733182e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -850,20 +850,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, static int btree_csum_one_bio(struct bio *bio) { - struct bio_vec *bvec = bio-bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; struct btrfs_root *root; - int ret = 0; + int i, ret = 0; - WARN_ON(bio-bi_vcnt = 0); - while (bio_index bio-bi_vcnt) 
{ + bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec-bv_page-mapping-host)-root; ret = csum_dirty_buffer(root, bvec-bv_page); if (ret) break; - bio_index++; - bvec++; } + return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0df176a..ea5a08b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2014,7 +2014,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } bio-bi_bdev = dev-bdev; bio_add_page(bio, page, length, start - page_offset(page)); - btrfsic_submit_bio(WRITE_SYNC, bio); + btrfsic_submit_bio(WRITE_SYNC, bio); /* XXX: submit_bio_wait() */ wait_for_completion(compl); if (!test_bit(BIO_UPTODATE, bio-bi_flags)) { @@ -2340,12 +2340,13 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio, int err) { - struct bio_vec *bvec = bio-bi_io_vec + bio-bi_vcnt - 1; + struct bio_vec *bvec; struct extent_io_tree *tree; u64 start; u64 end; + int i; - do { + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec-bv_page; tree = BTRFS_I(page-mapping-host)-io_tree; @@ -2363,14 +2364,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) start = page_offset(page); end = start + bvec-bv_offset + bvec-bv_len - 1; - if (--bvec = bio-bi_io_vec) - prefetchw(bvec-bv_page-flags); - if (end_extent_writepage(page, err, start, end)) continue; end_page_writeback(page); - } while (bvec = bio-bi_io_vec); + } bio_put(bio); } @@ -2400,9 +2398,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, */ static void end_bio_extent_readpage(struct bio *bio, int err) { + struct bio_vec *bvec; int uptodate = test_bit(BIO_UPTODATE, bio-bi_flags); - struct bio_vec *bvec_end = bio-bi_io_vec + bio-bi_vcnt - 1; - struct bio_vec *bvec = bio-bi_io_vec; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct
[PATCH 3/9] block: Move bouncing to generic_make_request()
Next patch is going to make generic_make_request() handle arbitrary sized bios by splitting them if necessary. It makes more sense to call blk_queue_bounce() first, partly so it's working on larger bios - but also the code that splits bios, and __blk_recalc_rq_segments(), won't have to take into account bouncing (as it'll already have been done). Also, __blk_recalc_rq_segments() now doesn't have to take into account potential bouncing - it's already been done. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Jiri Kosina jkos...@suse.cz Cc: Asai Thambi S P asamymuth...@micron.com --- block/blk-core.c | 14 +++--- block/blk-merge.c | 13 - drivers/block/mtip32xx/mtip32xx.c | 2 -- drivers/block/pktcdvd.c | 2 -- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d9cab97..3c7467e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1466,13 +1466,6 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; - /* -* low level driver can indicate that it wants pages above a -* certain limit bounced to low memory (ie for highmem, or even -* ISA dma in theory) -*/ - blk_queue_bounce(q, bio); - if (bio_integrity_enabled(bio) bio_integrity_prep(bio)) { bio_endio(bio, -EIO); return; @@ -1822,6 +1815,13 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio-bi_bdev); + /* +* low level driver can indicate that it wants pages above a +* certain limit bounced to low memory (ie for highmem, or even +* ISA dma in theory) +*/ + blk_queue_bounce(q, bio); + q-make_request_fn(q, bio); bio = bio_list_pop(current-bio_list); diff --git a/block/blk-merge.c b/block/blk-merge.c index 953b8df..9680ec73 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { struct bio_vec bv, bvprv = { NULL }; - int 
cluster, high, highprv = 1; + int cluster, prev = 0; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -27,13 +27,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, nr_phys_segs = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { - /* -* the trick here is making sure that a high page is -* never considered part of another segment, since that -* might change with the bounce page. -*/ - high = page_to_pfn(bv.bv_page) queue_bounce_pfn(q); - if (!high !highprv cluster) { + if (prev cluster) { if (seg_size + bv.bv_len queue_max_segment_size(q)) goto new_segment; @@ -44,6 +38,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, seg_size += bv.bv_len; bvprv = bv; + prev = 1; continue; } new_segment: @@ -53,8 +48,8 @@ new_segment: nr_phys_segs++; bvprv = bv; + prev = 1; seg_size = bv.bv_len; - highprv = high; } bbio = bio; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 52b2f2a..d4c669b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4016,8 +4016,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) sg = mtip_hw_get_scatterlist(dd, tag, unaligned); if (likely(sg != NULL)) { - blk_queue_bounce(queue, bio); - if (unlikely((bio)-bi_vcnt MTIP_MAX_SG)) { dev_warn(dd-pdev-dev, Maximum number of SGL entries exceeded\n); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 1bf1f22..7991cc8 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2486,8 +2486,6 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) goto end_io; } - blk_queue_bounce(q, bio); - do { sector_t zone = get_zone(bio-bi_iter.bi_sector
[PATCH 4/9] block: Make generic_make_request handle arbitrary sized bios
The way the block layer is currently written, it goes to great lengths to avoid having to split bios; upper layer code (such as bio_add_page()) checks what the underlying device can handle and tries to always create bios that don't need to be split. But this approach becomes unwieldy and eventually breaks down with stacked devices and devices with dynamic limits, and it adds a lot of complexity. If the block layer could split bios as needed, we could eliminate a lot of complexity elsewhere - particularly in stacked drivers. Code that creates bios can then create whatever size bios are convenient, and more importantly stacked drivers don't have to deal with both their own bio size limitations and the limitations of the (potentially multiple) devices underneath them. In the future this will let us delete merge_bvec_fn and a bunch of other code. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Neil Brown ne...@suse.de Cc: Alasdair Kergon a...@redhat.com Cc: dm-de...@redhat.com --- block/blk-core.c | 26 ++- block/blk-merge.c | 120 + block/blk.h| 3 ++ include/linux/blkdev.h | 4 ++ 4 files changed, 143 insertions(+), 10 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3c7467e..abc5d23 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -566,6 +566,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q-id 0) goto fail_c; + q-bio_split = bioset_create(4, 0); + if (!q-bio_split) + goto fail_id; + q-backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q-backing_dev_info.state = 0; @@ -575,7 +579,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) err = bdi_init(q-backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(q-backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -620,6 +624,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_bdi: 
bdi_destroy(q-backing_dev_info); +fail_split: + bioset_free(q-bio_split); fail_id: ida_simple_remove(blk_queue_ida, q-id); fail_c: @@ -1681,15 +1687,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) - nr_sectors queue_max_hw_sectors(q))) { - printk(KERN_ERR bio too big device %s (%u %u)\n, - bdevname(bio-bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio-bi_bdev-bd_part; if (should_fail_request(part, bio-bi_iter.bi_size) || should_fail_request(part_to_disk(part)-part0, @@ -1814,6 +1811,7 @@ void generic_make_request(struct bio *bio) current-bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio-bi_bdev); + struct bio *split = NULL; /* * low level driver can indicate that it wants pages above a @@ -1822,6 +1820,14 @@ void generic_make_request(struct bio *bio) */ blk_queue_bounce(q, bio); + if (!blk_queue_largebios(q)) + split = blk_bio_segment_split(q, bio, q-bio_split); + if (split) { + bio_chain(split, bio); + bio_list_add(current-bio_list, bio); + bio = split; + } + q-make_request_fn(q, bio); bio = bio_list_pop(current-bio_list); diff --git a/block/blk-merge.c b/block/blk-merge.c index 9680ec73..6e07213 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,6 +9,126 @@ #include blk.h +static struct bio *blk_bio_discard_split(struct request_queue *q, +struct bio *bio, +struct bio_set *bs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + /* Zero-sector (unknown) and one-sector granularities are the same. 
*/ + granularity = max(q-limits.discard_granularity 9, 1U); + + max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX 9); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) = max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* +* If the next starting sector would be misaligned, stop
[PATCH 5/9] block: Gut bio_add_page(), kill bio_add_pc_page()
Since generic_make_request() can now handle arbitrary size bios, all we have to do is make sure the bvec array doesn't overflow. Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/scsi/osd/osd_initiator.c | 5 +- drivers/target/target_core_pscsi.c | 5 +- fs/bio.c | 158 +++-- fs/exofs/ore.c | 8 +- fs/exofs/ore_raid.c| 8 +- include/linux/bio.h| 2 - 6 files changed, 37 insertions(+), 149 deletions(-) diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index bac04c2..e52b30d 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1043,7 +1043,6 @@ EXPORT_SYMBOL(osd_req_read_sg); static struct bio *_create_sg_bios(struct osd_request *or, void **buff, const struct osd_sg_entry *sglist, unsigned numentries) { - struct request_queue *q = osd_request_queue(or-osd_dev); struct bio *bio; unsigned i; @@ -1060,9 +1059,9 @@ static struct bio *_create_sg_bios(struct osd_request *or, unsigned added_len; BUG_ON(offset + len PAGE_SIZE); - added_len = bio_add_pc_page(q, bio, page, len, offset); + added_len = bio_add_page(bio, page, len, offset); if (unlikely(len != added_len)) { - OSD_DEBUG(bio_add_pc_page len(%d) != added_len(%d)\n, + OSD_DEBUG(bio_add_page len(%d) != added_len(%d)\n, len, added_len); bio_put(bio); return ERR_PTR(-ENOMEM); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 551c96c..d65d512 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -922,12 +922,11 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, tbio = tbio-bi_next = bio; } - pr_debug(PSCSI: Calling bio_add_pc_page() i: %d + pr_debug(PSCSI: Calling bio_add_page() i: %d bio: %p page: %p len: %d off: %d\n, i, bio, page, len, off); - rc = bio_add_pc_page(pdv-pdv_sd-request_queue, - bio, page, bytes, off); + rc = bio_add_page(bio, page, bytes, off); if (rc != bytes) goto fail; diff --git a/fs/bio.c b/fs/bio.c index 7d538a1..c60bfcb 
100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -665,12 +665,22 @@ int bio_get_nr_vecs(struct block_device *bdev) } EXPORT_SYMBOL(bio_get_nr_vecs); -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned short max_sectors) +/** + * bio_add_page- attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + */ +int bio_add_page(struct bio *bio, struct page *page, +unsigned int len, unsigned int offset) { - int retried_segments = 0; - struct bio_vec *bvec; + struct bio_vec *bv; /* * cloned bio must not modify vec list @@ -678,41 +688,17 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio-bi_iter.bi_size + len) 9) max_sectors) - return 0; - /* * For filesystems with a blocksize smaller than the pagesize * we will often be called with the same page as last time and * a consecutive offset. Optimize this special case. */ if (bio-bi_vcnt 0) { - struct bio_vec *prev = bio-bi_io_vec[bio-bi_vcnt - 1]; - - if (page == prev-bv_page - offset == prev-bv_offset + prev-bv_len) { - unsigned int prev_bv_len = prev-bv_len; - prev-bv_len += len; - - if (q-merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec
Re: [dm-devel] [PATCH 4/9] block: Make generic_make_request handle arbitrary sized bios
On Mon, Nov 04, 2013 at 03:56:52PM -0800, Mike Christie wrote: On 11/04/2013 03:36 PM, Kent Overstreet wrote: @@ -1822,6 +1820,14 @@ void generic_make_request(struct bio *bio) */ blk_queue_bounce(q, bio); + if (!blk_queue_largebios(q)) + split = blk_bio_segment_split(q, bio, q-bio_split); Is it assumed bios coming down this path are created using bio_add_page? If not, does blk_bio_segment_split need a queue_max_sectors or queue_max_hw_sectors check? I only saw a segment count check below. Shoot, you're absolutely right - thanks, I'll have this fixed in the next version. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: Revert bio_clone() default behaviour
This patch reverts the default behaviour introduced by 9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger shares the source bio's biovec, cloning the biovec is once again the default. Instead, we add a new bio_clone_biovec_fast(), which creates a clone that shares the source's biovec. This patch changes bcache and md to use __bio_clone_biovec_fast() since they're expecting the new behaviour due to other refactoring; most of the other uses of bio_clone() should be same to convert to the _fast() variant but that will be done more incrementally in other patches (bio_split() in particular). Note that __bio_clone() isn't being readded - the reason being that with immutable biovecs allocating the right number of biovecs for the new clone is no longer trivial so we don't want drivers trying to do that themselves. This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 - __bio_clone_fast() should not be setting bi_vcnt for bios that do not own the biovec (see Documentation/block/biovecs.txt for rationale) - in short, not setting it might cause bugs in the short term but long term it's likely to hide nastier more subtle bugs, we don't want code looking at bi_vcnt at all for bios it does not own. However, this patch _shouldn't_ cause any regressions because of this since we're reverting back to the old bio_clone() behaviour. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Chris Mason chris.ma...@fusionio.com Cc: Mike Snitzer snit...@redhat.com Cc: NeilBrown ne...@suse.de Cc: Olof Johansson o...@lixom.net --- Chris, Olaf, can you two in particular test this? I have tested the bounce buffer code (and bcache), but Jens told me today there was an md bug that I _still_ can't find any emails about so I'm not sure what to test for that. 
drivers/md/bcache/request.c | 6 +-- drivers/md/dm.c | 4 +- fs/bio.c| 104 +++- include/linux/bio.h | 6 +-- mm/bounce.c | 1 - 5 files changed, 72 insertions(+), 49 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 52a1fef..ef44198 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -681,7 +681,7 @@ static void do_bio_hook(struct search *s) struct bio *bio = s-bio.bio; bio_init(bio); - __bio_clone(bio, s-orig_bio); + __bio_clone_fast(bio, s-orig_bio); bio-bi_end_io = request_endio; bio-bi_private = s-cl; @@ -969,8 +969,8 @@ static void request_write(struct cached_dev *dc, struct search *s) trace_bcache_write(s-orig_bio, s-writeback, s-op.skip); if (!s-writeback) { - s-op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc-disk.bio_split); + s-op.cache_bio = bio_clone_bioset_fast(bio, GFP_NOIO, + dc-disk.bio_split); closure_bio_submit(bio, cl, s-d); } else { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8e6174c..bafe7ed 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1135,7 +1135,7 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio, { struct bio *clone = tio-clone; - __bio_clone(clone, bio); + __bio_clone_fast(clone, bio); if (bio_integrity(bio)) bio_integrity_clone(clone, bio, GFP_NOIO); @@ -1177,7 +1177,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, * ci-bio-bi_max_vecs is BIO_INLINE_VECS anyway, for both flush * and discard, so no need for concern about wasted bvec allocations. 
*/ -__bio_clone(clone, ci-bio); +__bio_clone_fast(clone, ci-bio); if (len) bio_setup_sector(clone, ci-sector, len); diff --git a/fs/bio.c b/fs/bio.c index 6046c91..99ff176 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -539,15 +539,17 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) EXPORT_SYMBOL(bio_phys_segments); /** - * __bio_clone - clone a bio + * __bio_clone_fast - clone a bio that shares the original bio's biovec * @bio: destination bio * @bio_src: bio to clone * * Clone a bio. Caller will own the returned bio, but not * the actual data it points to. Reference count of returned * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. */ -void __bio_clone(struct bio *bio, struct bio *bio_src) +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) { BUG_ON(bio-bi_pool BIO_POOL_IDX(bio) != BIO_POOL_NONE); @@ -560,20 +562,19 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) bio-bi_rw = bio_src-bi_rw; bio-bi_iter = bio_src-bi_iter; bio-bi_io_vec = bio_src-bi_io_vec; - bio-bi_vcnt = bio_src-bi_vcnt
Re: [PATCH] block: Revert bio_clone() default behaviour
On Tue, Nov 05, 2013 at 09:02:19PM -0800, Olof Johansson wrote: Hi, On Tue, Nov 5, 2013 at 7:48 PM, Kent Overstreet k...@daterainc.com wrote: Chris, Olaf, can you two in particular test this? I have tested the bounce buffer code (and bcache), but Jens told me today there was an md bug that I _still_ can't find any emails about so I'm not sure what to test for that. (I'm guessing you mean me and not some German person here. :-) What is this expected to apply on top of? It doesn't apply cleanly on the 20131105 -next tree which is where I had been seeing the issues. Sorry - it's on top of Jens' for-3.13/core branch, git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: Revert bio_clone() default behaviour
On Wed, Nov 06, 2013 at 11:11:30AM -0500, Chris Mason wrote: Quoting Kent Overstreet (2013-11-05 22:48:41) This patch reverts the default behaviour introduced by 9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger shares the source bio's biovec, cloning the biovec is once again the default. Instead, we add a new bio_clone_biovec_fast(), which creates a clone that shares the source's biovec. This patch changes bcache and md to use ^ dm? __bio_clone_biovec_fast() since they're expecting the new behaviour due to other refactoring; most of the other uses of bio_clone() should be same to convert to the _fast() variant but that will be done more incrementally in other patches (bio_split() in particular). Hi Kent, I noticed yesterday the _fast variants of bio clone introduce sharing between the src and the clone, but without any reference counts: bio-bi_io_vec = bio_src-bi_io_vec; Have you audited all of the _fast users to make sure they are not freeing the src before the clone? Sorry if this came up already in past reviews. Yup - that should actually be safe for all the existing bio_clone() users actually, I audited all of them - because normally you're not going to complete the original bio until the clone finishes. Note that __bio_clone() isn't being readded - the reason being that with immutable biovecs allocating the right number of biovecs for the new clone is no longer trivial so we don't want drivers trying to do that themselves. This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 - __bio_clone_fast() should not be setting bi_vcnt for bios that do not own the biovec (see Documentation/block/biovecs.txt for rationale) - in short, I think I see what you mean with tying bi_vcnt to ownership of the bio, but we're not consistent. 
Looking at bio_for_each_segment_all: * * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ #define bio_for_each_segment_all(bvl, bio, i) \ for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++) bio_for_each_segment_all still trusts bi_vcnt, so any bio_for_each_segment_all operation on a clone will basically be a noop. Just looking at MD raid1 make_request() mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); ... alloc_behind_pages(mbio, r1_bio); - bio_for_each_segment_all ... if (r1_bio->behind_bvecs) { bio_for_each_segment_all(bvec, mbio, j) ... I didn't test MD without the vcnt fix, but I think any operations in MD that duplicate data for raid1 turn into noops. I think we'll end up writing garbage (or nothing) to the second mirror. If you look at dm's crypt_free_buffer_pages(), it had similar problems. Those are fine actually - in both cases, they're bios they allocated, not the bios that were submitted to them. Though md _definitely_ shouldn't have been sharing the original bio's biovec, so looks like this patch will fix a bug there... (I remember seeing that code before and I thought I added a bio_clone_biovec() call to that md code, but apparently that never got committed. Argh.) not setting it might cause bugs in the short term but long term it's likely to hide nastier more subtle bugs, we don't want code looking at bi_vcnt at all for bios it does not own. I think the concept of bio ownership is still much too weak, at least for established users like MD and DM. I don't know how to verify the sharing of bi_io_vec without some kind of reference counting on the iovec. What's unclear about it? 
The rule is just - if you didn't allocate the biovec, don't modify it or use bio_for_each_segment_all() (probably I didn't quite state it clearly enough before though) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: block: Revert bio_clone() default behaviour
On Wed, Nov 06, 2013 at 03:31:02PM -0500, Mike Snitzer wrote: Hey Kent, Digging a bit in the LKML archive I think this patch is in response to this thread: https://lkml.org/lkml/2013/11/6/27 That thread I saw, Jens told me there was another one though Might be good to give context for which reported problem(s) are being fixed by this patch. On Tue, Nov 05 2013 at 10:48pm -0500, Kent Overstreet k...@daterainc.com wrote: This patch reverts the default behaviour introduced by 9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger shares the source bio's biovec, cloning the biovec is once again the default. Your focus, in terms of revert, seems to be on restoring bio_clone_bioset, so: s/bio_clone_biovec/bio_clone_bioset/ Maybe best to say effectively reverts since you aren't reverting 9fc6286f347d00528adcdcf12396d220f47492ed ? Also s/clonger/longer/ typo ^ Instead, we add a new bio_clone_biovec_fast(), which creates a clone that shares the source's biovec. This patch changes bcache and md to use s/md/dm/ Whoops :p __bio_clone_biovec_fast() since they're expecting the new behaviour due to other refactoring; most of the other uses of bio_clone() should be same to convert to the _fast() variant but that will be done more s/same/safe/ Thanks incrementally in other patches (bio_split() in particular). Note that __bio_clone() isn't being readded - the reason being that with immutable biovecs allocating the right number of biovecs for the new clone is no longer trivial so we don't want drivers trying to do that themselves. This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 - __bio_clone_fast() should not be setting bi_vcnt for bios that do not own the biovec (see Documentation/block/biovecs.txt for rationale) - in short, not setting it might cause bugs in the short term but long term it's likely to hide nastier more subtle bugs, we don't want code looking at bi_vcnt at all for bios it does not own. 
However, this patch _shouldn't_ cause any regressions because of this since we're reverting back to the old bio_clone() behaviour. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Chris Mason chris.ma...@fusionio.com Cc: Mike Snitzer snit...@redhat.com Cc: NeilBrown ne...@suse.de Cc: Olof Johansson o...@lixom.net --- Chris, Olaf, can you two in particular test this? I have tested the bounce buffer code (and bcache), but Jens told me today there was an md bug that I _still_ can't find any emails about so I'm not sure what to test for that. /me assumes you really mean md here, given Chris's later reply in this thread. Relative to DM, this patch looks fine to me: Acked-by: Mike Snitzer snit...@redhat.com Thanks! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: Revert bio_clone() default behaviour
On Wed, Nov 06, 2013 at 03:22:36PM -0500, Chris Mason wrote: Quoting Kent Overstreet (2013-11-06 15:02:22) Yup - that should actually be safe for all the existing bio_clone() users actually, I audited all of them - because normally you're not going to complete the original bio until the clone finishes. I'd say we need an ack from Neil before we can switch to _fast. The completions look non-trivial and _fast adds new ordering requirements on free. Agreed, I've already broken md enough lately :P (I've started to work on a real test suite for the block layer, that'll help with this stuff...) If you look at dm's crypt_free_buffer_pages(), it had similar problems. Those are fine actually - in both cases, they're bios they allocated, not the bios that were submitted to them. Both are expecting bi_vcnt to be non-zero after a clone, which wasn't the case before the bi_vcnt patch. With your current patch (adding _fast) both should now be correct. Yeah, but bi_vcnt being zero was just a symptom - the underlying reason the old code was wrong was that with the clone sharing the parent's biovec, md was modifying a biovec (including bv_page!) that it didn't own - _that_ was what was horribly broken about it. What's unclear about it? The rule is just - if you didn't allocate the biovec, don't modify it or use bio_for_each_segment_all() (probably I didn't quite state it clearly enough before though) That part makes sense. The new rule that scares me is that we can't free the src of the clone until all the clones are freed. If it works with today's existing users it feels like it is more by accident than design. I'm not saying we can't do it, we just need some bigger flashing warning lights. Yeah, Jens was saying the same thing yesterday, hence this patch. 
OTOH - with regards to just the ordering requirements, the more I look at various code the less accidental the fact that that works seems to be: the best explanation I've come up with so far is that you already needed to ensure that the _pages_ the clone points to stick around until the clone completes, and if you don't own the original bio the only way to do that is to not complete the original bio until after the clone completes. So if you're a driver cloning bios that were submitted to you, bio_clone_fast() introduces no new ordering requirements. On the third hand - if you're cloning (i.e. splitting) your own bios, in that case it would be possible to screw up the ordering - I don't know of any code in the kernel that does this today (except for, sort of, bcache) but my dio rewrite takes this approach - but if you do the obvious and sane thing with bio_chain() it's a non issue, it seems to me you'd have to come up with something pretty contrived and dumb for this to actually be an issue in practice. Anyways, I haven't come to any definite conclusions, those are just my observations so far. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block/s390: Fixes for Convert bio_for_each_segment() to bvec_iter
5d1f127c3e0c57d64ce75ee04a0db2b40a3e21df was incomplete - this just converts s390's drivers. Signed-off-by: Kent Overstreet k...@daterainc.com --- I was poking around a bit, and it looks like there's nothing really preventing these drivers from building on non s390 except that a bunch of headers are in arch/s390/include. Would be nice to get that fixed so we can get better build coverage. drivers/s390/block/dasd_diag.c | 10 drivers/s390/block/dasd_eckd.c | 48 ++-- drivers/s390/block/dasd_fba.c| 26 +-- drivers/s390/block/scm_blk.c | 8 +++--- drivers/s390/block/scm_blk_cluster.c | 4 +-- 5 files changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 92bd22c..9cbc567 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -504,7 +504,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, struct dasd_diag_req *dreq; struct dasd_diag_bio *dbio; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned int count, datasize; sector_t recid, first_rec, last_rec; @@ -525,10 +525,10 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, /* Check struct bio and count the number of blocks for the request. */ count = 0; rq_for_each_segment(bv, req, iter) { - if (bv-bv_len (blksize - 1)) + if (bv.bv_len (blksize - 1)) /* Fba can only do full blocks. */ return ERR_PTR(-EINVAL); - count += bv-bv_len (block-s2b_shift + 9); + count += bv.bv_len (block-s2b_shift + 9); } /* Paranoia. 
*/ if (count != last_rec - first_rec + 1) @@ -545,8 +545,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, dbio = dreq-bio; recid = first_rec; rq_for_each_segment(bv, req, iter) { - dst = page_address(bv-bv_page) + bv-bv_offset; - for (off = 0; off bv-bv_len; off += blksize) { + dst = page_address(bv.bv_page) + bv.bv_offset; + for (off = 0; off bv.bv_len; off += blksize) { memset(dbio, 0, sizeof (struct dasd_diag_bio)); dbio-type = rw_cmd; dbio-block_number = recid + 1; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 5adb204..030d6e0 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2507,7 +2507,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( struct dasd_ccw_req *cqr; struct ccw1 *ccw; struct req_iterator iter; - struct bio_vec *bv; + struct bio_vec bv; char *dst; unsigned int off; int count, cidaw, cplength, datasize; @@ -2529,13 +2529,13 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( count = 0; cidaw = 0; rq_for_each_segment(bv, req, iter) { - if (bv-bv_len (blksize - 1)) + if (bv.bv_len (blksize - 1)) /* Eckd can only do full blocks. */ return ERR_PTR(-EINVAL); - count += bv-bv_len (block-s2b_shift + 9); + count += bv.bv_len (block-s2b_shift + 9); #if defined(CONFIG_64BIT) - if (idal_is_needed (page_address(bv-bv_page), bv-bv_len)) - cidaw += bv-bv_len (block-s2b_shift + 9); + if (idal_is_needed (page_address(bv.bv_page), bv.bv_len)) + cidaw += bv.bv_len (block-s2b_shift + 9); #endif } /* Paranoia. 
*/ @@ -2606,16 +2606,16 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( last_rec - recid + 1, cmd, basedev, blksize); } rq_for_each_segment(bv, req, iter) { - dst = page_address(bv-bv_page) + bv-bv_offset; + dst = page_address(bv.bv_page) + bv.bv_offset; if (dasd_page_cache) { char *copy = kmem_cache_alloc(dasd_page_cache, GFP_DMA | __GFP_NOWARN); if (copy rq_data_dir(req) == WRITE) - memcpy(copy + bv-bv_offset, dst, bv-bv_len); + memcpy(copy + bv.bv_offset, dst, bv.bv_len); if (copy) - dst = copy + bv-bv_offset; + dst = copy + bv.bv_offset; } - for (off = 0; off bv-bv_len; off += blksize) { + for (off = 0; off bv.bv_len; off += blksize) { sector_t trkid = recid
Re: [PATCH] block: Revert bio_clone() default behaviour
On Wed, Nov 06, 2013 at 04:25:45PM -0500, Chris Mason wrote: Quoting Kent Overstreet (2013-11-06 15:57:34) On Wed, Nov 06, 2013 at 03:22:36PM -0500, Chris Mason wrote: Quoting Kent Overstreet (2013-11-06 15:02:22) [ ... nods, thanks! ... ] OTOH - with regards to just the ordering requirements, the more I look at various code the less accidental the fact that that works seems to be: the best explanation I've come up with so far is that you already needed to ensure that the _pages_ the clone points to stick around until the clone completes, and if you don't own the original bio the only way to do that is to not complete the original bio until after the clone completes. So if you're a driver cloning bios that were submitted to you, bio_clone_fast() introduces no new ordering requirements. On the third hand - if you're cloning (i.e. splitting) your own bios, in that case it would be possible to screw up the ordering - I don't know of any code in the kernel that does this today (except for, sort of, bcache) but my dio rewrite takes this approach - but if you do the obvious and sane thing with bio_chain() it's a non issue, it seems to me you'd have to come up with something pretty contrived and dumb for this to actually be an issue in practice. Anyways, I haven't come to any definite conclusions, those are just my observations so far. I do think you're right. We all seem to have clones doing work on behalf of the original, and when everyone is done we complete the original. But, btrfs does have silly things like this: dio_end_io(dio_bio, err); // end and free the original bio_put(bio); // free the clone It's not a bug yet, but given enough time the space between those two frees will grow new code that kills us all. Hopefully the DIO situation will get better in the near future, Josef was telling me btrfs was probably going to end up with its own DIO code (replacing a good chunk of direct-io.c) and that should get a lot easier and saner for you guys with my dio rewrite. 
Need to finish getting that to pass xfstests... Really though, the new stuff is better, thanks. Thanks! :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Thu, Nov 07, 2013 at 11:17:22AM -0800, Olof Johansson wrote: On Sat, Nov 2, 2013 at 1:50 PM, Dave Kleikamp dave.kleik...@oracle.com wrote: On 11/01/2013 03:53 PM, Jens Axboe wrote: On 11/01/2013 02:41 PM, Dave Kleikamp wrote: On 11/01/2013 03:27 PM, Jens Axboe wrote: On 11/01/2013 02:22 PM, Stephen Rothwell wrote: Hi Jens, On Fri, 01 Nov 2013 09:10:43 -0600 Jens Axboe ax...@kernel.dk wrote: On 10/31/2013 09:20 PM, Stephen Rothwell wrote: Today's linux-next merge of the block tree got a conflict in drivers/block/loop.c between commit 2486740b52fd (loop: use aio to perform io on the underlying file) from the aio-direct tree and commit ed2d2f9a8265 (block: Abstract out bvec iterator) from the block tree. I fixed it up (I think - see below - I have also attached the final resulting file) and can carry the fix as necessary (no action is required). What tree is this from? It'd be a lot more convenient to fold that loop patch into my tree, especially since the block tree in linux-next failed after this merge. I can only agree with you. It is from the aio-direct tree (probably misnamed by me) (git://github.com/kleikamp/linux-shaggy.git#for-next) run by Dave Kleikamp. Dave, input requested. In any case, I would suggest dropping the aio-direct tree instead of the entire block tree for coverage purposes, if merge or build failures happen because of it. I've had these patches in linux-next since August, and I'd really like to push them in the 3.13 merge window. Are there other problems besides this merge issue? I'll take a closer look at Stephen's merge patch and see if I find any other issues, but I really don't want to pull these patches out of linux-next now. I'm not saying that the patches should be dropped or not go into 3.13. What I'm saying is that if the choice is between having the bio and blk-mq stuff in linux-next or an addon to the loop driver, the decision should be quite clear. 
So we've three immediate options: 1) You base it on top of the block tree 2) I carry the loop updates 3) You hand Stephen a merge patch for the resulting merge of the two Attached is a merge patch and the merged loop.c. I'm having problems with the loop driver with both the block and my tree. I'll continue to look at that, but everything should build cleanly with this. Hijacking(?) this thread since it seems relevant: I noticed the following panic on a chromebox with last night's next. 20131106 shows it as well. I didn't go back further to see. 3.12 runs fine. I bisected it down, and unfortunately it points at Stephen's merge commit: commit 3caa8f38e7eeb56c7d48b0d5c323ffbf4939635d Merge: 447b374 bb6f7be Author: Stephen Rothwell s...@canb.auug.org.au Date: Thu Nov 7 14:07:20 2013 +1100 Merge remote-tracking branch 'aio-direct/for-next' Conflicts: drivers/block/loop.c fs/nfs/direct.c fs/nfs/file.c include/linux/blk_types.h ... but the branch alone runs fine. Context to the failure: Userspace is already up and running. ChromeOS will do ecryptfs and loopback mounts, etc, which is likely where this is hitting given the process running. It definitely happens during early userspace setup. That looks like the bi_remaining BUG_ON() in bio_endio(), probably related to the loopback driver. I'll start looking at the code soon as I get into the office, this one should be easy to track down. Seems like we might be in for a bumpy ride in 3.13 w.r.t. block if the breakage we've found this week in -next is any indication. This seems to be reliably reproducing for me so I can help collect data if needed, Dave/Jens. [3.373979] EXT4-fs (sda1): mounted filesystem with ordered data mode. Opts: commit=600 [3.385719] EXT4-fs (sda8): mounted filesystem with ordered data mode. Opts: commit=600 [3.475540] bio: create slab bio-1 at 1 [3.483577] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: discard,commit=600 [3.556890] EXT4-fs (sda1): re-mounted. 
Opts: commit=600,data=ordered [3.636658] [ cut here ] [3.641345] kernel BUG at /mnt/host/source/src/third_party/kernel-next/fs/bio.c:1725! [3.649266] invalid opcode: [#1] SMP [3.653473] Modules linked in: [3.656610] CPU: 0 PID: 107 Comm: loop0 Tainted: GW 3.12.0-next-20131107 #6 [3.664645] Hardware name: SAMSUNG Stumpy, BIOS Google_Stumpy.2183.0.2012_05_01_1303 05/01/2012 [3.673463] task: 88010001e250 ti: 880074c7e000 task.ti: 880074c7e000 [3.681023] RIP: 0010:[8111229d] [8111229d] bio_endio+0x13/0x59 [3.67] RSP: 0018:880074c7fc50 EFLAGS: 00010246 [3.694272] RAX: RBX: 880074cb6120 RCX: [3.701496] RDX: fffb RSI: fffb RDI: 880074c4b000 [3.708728]
Re: linux-next: manual merge of the block tree with the tree
On Thu, Nov 07, 2013 at 01:20:26PM -0600, Dave Kleikamp wrote: On 11/02/2013 03:50 PM, Dave Kleikamp wrote: On 11/01/2013 03:53 PM, Jens Axboe wrote: So we've three immediate options: 1) You base it on top of the block tree 2) I carry the loop updates 3) You hand Stephen a merge patch for the resulting merge of the two Attached is a merge patch and the merged loop.c. I'm having problems with the loop driver with both the block and my tree. I'll continue to look at that, but everything should build cleanly with this. Looking back, I obviously rushed the last patch out. This merge patch, and the resulting loop.c, fix my problem. My code is working with Jens' block tree now. Jens, I ended up replacing a call to bio_iovec_idx() with __bvec_iter_bvec() since the former was removed. It's not very elegant, but it works. I'm open to suggestions on a cleaner fix, but it can wait until one or both of these trees is merged. No, that's definitely wrong. Read Documentation/block/biovecs.txt - you need to use either the new bio_iovec() or bio_iter_iovec(). I can do the conversion later today. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/9] block: Convert various code to bio_for_each_segment()
On Thu, Nov 07, 2013 at 12:26:30PM +0100, Jan Kara wrote: On Mon 04-11-13 15:36:19, Kent Overstreet wrote: With immutable biovecs we don't want code accessing bi_io_vec directly - the uses this patch changes weren't incorrect since they all own the bio, but it makes the code harder to audit for no good reason - also, this will help with multipage bvecs later. I think you've missed the code in fs/ext4/page-io.c in the conversion (likely because it was added relatively recently). Thanks! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Fri, Nov 08, 2013 at 12:53:07PM +1100, Stephen Rothwell wrote: Hi all, On Thu, 07 Nov 2013 18:04:57 -0600 Dave Kleikamp dave.kleik...@oracle.com wrote: Can you please drop the aio-direct tree for the time being? OK, I was afraid of this, but, yes, I can drop it. I am not quite sure what affect this will have on Andrew's tree, though (hopefully not too much). This is a bit disappointing. Dave's stuff have been sitting in linux-next for quite some time (probably too long) so it is not as if Kent and Jens (should) have been unaware of it. Yeah, I have to apologize for not noticing sooner. Mea culpa. That said, we spent some time hashing things out on IRC and I'm pretty excited for the new plan; Zach Brown also had some good input. My stuff is not aligning very well with the immutable biovecs and Kent has a different approach in mind. I'll be working with him on a replacement that will hopefully be simpler. Presumably not before v3.14, right? Yeah, the gist of it is that loopback driver is going to be passing bios directly to the dio code - my dio rewrite gets us most of the way there, and after immutable biovecs there's only ~3 (much simpler) patches left that the dio rewrite depends on. The remaining issue to deal with is the fact that on writes, the dio code bails out if it hits an unmapped block (i.e. a hole) and filemap.c finishes it up with the buffered io code - we need the dio code to handle that in a self contained fashion. Zach had a (horribly ugly, but definitely workable) idea for that, so I'm feeling pretty optimistic about this right now. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Thu, Nov 07, 2013 at 11:33:24PM -0800, Christoph Hellwig wrote: The changes for direct I/O from kernel space have been in for a long time, and they are blocking multiple consumers of the interface from getting submitted for about a year now. Even if the guts of the direct-io code will get a write based on another patchset from Kent that will go on top of the immutable iovec changes we need the interfaces now and not another year down the road. What else is blocked on this patch series? Honest question. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Thu, Nov 07, 2013 at 11:44:45PM -0800, Christoph Hellwig wrote: On Thu, Nov 07, 2013 at 11:39:59PM -0800, Kent Overstreet wrote: On Thu, Nov 07, 2013 at 11:33:24PM -0800, Christoph Hellwig wrote: The changes for direct I/O from kernel space have been in for a long time, and they are blocking multiple consumers of the interface from getting submitted for about a year now. Even if the guts of the direct-io code will get a write based on another patchset from Kent that will go on top of the immutable iovec changes we need the interfaces now and not another year down the road. What else is blocked on this patch series? Honest question. From me alone: Support for ecryptfs to not double cache. AIO support for target_core_file. To replace code in staging: Removal of the lustre-specific loop driver. And I remember others claiming to want to submit bits, too. So, I don't think the iov_iter stuff is the right approach for solving the loop issue; it's an ugly hack and after immutable biovecs we're pretty close to a better solution and some major cleanups too. I don't know about ecryptfs and AIO for target, though - are there patches for those you could point me at so I can have a look? I can believe the iov_iter stuff is necessary for those, but I'd like to convince myself there isn't a cleaner solution. Regardless, I don't want to be blocking anyone else's work; if we do want the iov_iter stuff in now (I'm not arguing one way or the other till I look at the issues you pointed out) I can write a small patch to correctly merge with immutable bvecs; I looked at it today and it's pretty straightforward (if somewhat ugly). -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Fri, Nov 08, 2013 at 12:02:21AM -0800, Christoph Hellwig wrote: On Thu, Nov 07, 2013 at 11:56:17PM -0800, Kent Overstreet wrote: So, I don't think the iov_iter stuff is the right approach for solving the loop issue; it's an ugly hack and after immutable biovecs we're pretty close to a better solution and some major cleanups too. All the consumers aren't limited to a block-based filesystem backing, including loop. So we need a file-ops based approach for in-kernel dio/aio. If you have a counter proposal please at least describe it. The core issue isn't whether the IO is going to a block based filesystem (but thanks for pointing out that that's not necessarily true!) but whether we want to work with pinned pages or not. If pinned pages are ok for everything, then bios as a common interface work - likely evolving them a bit to be more general (it's just bi_bdev and bi_sector that's actually block specific) - and IMO that would be far preferable to this abstraction layer. If OTOH we need a common interface that's also for places where we can't afford the overhead of pinning user pages - that's a different story, and maybe we do need all this infrastructure then. That's why I'm asking about the stuff you mentioned, I'm honestly not sure. What I'm working towards though is a clean separation between buffered and direct code paths, so that buffered IO can continue to work with iovs and for O_DIRECT the first thing you do is fill out a bio with pinned pages and send it down to filesystem code or wherever it's going to go. That make sense? I can show you more concretely what I'm working on if you want. Or if I'm full of crap and this is useless for what you guys want I'm sure you'll let me know :) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the block tree with the tree
On Fri, Nov 08, 2013 at 12:32:51AM -0800, Christoph Hellwig wrote: On Fri, Nov 08, 2013 at 12:17:37AM -0800, Kent Overstreet wrote: The core issue isn't whether the IO is going to a block based filesystem (but thanks for pointing out that that's not necessarily true!) but whether we want to work with pinned pages or not. If pinned pages are ok for everything, then bios as a common interface work - likely evolving them a bit to be more general (it's just bi_bdev and bi_sector that's actually block specific) - and IMO that would be far preferable to this abstraction layer. If OTOH we need a common interface that's also for places where we can't afford the overhead of pinning user pages - that's a different story, and maybe we do need all this infrastructure then. That's why I'm asking about the stuff you mentioned, I'm honestly not sure. For both of them we will deal with kernel-allocated pages that are never mapped to userspace. This is likely to be true for all the consumers of in-kernel aio/dio as the existing interfaces handle user pages just fine. Ok, that's good to know. What I'm working towards though is a clean separation between buffered and direct code paths, so that buffered IO can continue to work with iovs and for O_DIRECT the first thing you do is fill out a bio with pinned pages and send it down to filesystem code or wherever it's going to go. I don't think pushing bios above the fs interface is a good idea. Note that the iovecs come from userspace for the user pages cases, so there is little we can do about that, and non-bio based direct I/O implementations generally work directly at just that level and never even touch the direct-io.c code. Bios can point to userspace pages just fine (and they do today for DIO to block devices/block based filesystems today). Don't think of bios as block device IOs, just think of them as the equivalent of an iovec + iov_iter except instead of (potentially userspace) pointers you have page pointers. 
That's the core part of what they do (and even if we don't standardize on bios for that we should standardize on _something_ for that functionality). Here's the helper function I wrote for my dio rewrite - it should really take an iov_iter instead of uaddr and len, but user iovec - bio is the easy bit: http://evilpiepirate.org/git/linux-bcache.git/commit/?h=block_stuffid=4462c03167767c656986afaf981f891705fd5d3b If you want to redo the -direct_IO address_space operation and generic_file_direct_write and the direct I/O side of generic_file_aio_read (both of which aren't anywhere near as generic as the name claims) I'm all for it, but it really won't affect the consumer of the in-kernel aio/dio code. I'm skeptical, but I'm way too tired to make good arguments and this touches on too much code that I'm less familiar with. also the flow of control in this code is such a goddamn clusterfuck I don't even know what to say. I'll dig more into the ecryptfs and target aio stuff tomorrow though. That make sense? I can show you more concretely what I'm working on if you want. Or if I'm full of crap and this is useless for what you guys want I'm sure you'll let me know :) It sounds interesting, but also a little confusing at this point, at least from the non-block side of view. Zack, you want to chime in? He was involved in the discussion yesterday, he might be able to explain this stuff better than I. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 16/22] dm: Refactor for new bio cloning/splitting
On Sat, Sep 28, 2013 at 12:59:09AM -0400, Mike Snitzer wrote: Hey Kent, I haven't been able to pinpoint the issue yet, but using your for-jens branch, if I create a dm-thin volume with this lvm command: lvcreate -L20G -V20G -T vg/pool --name thinlv and try to format /dev/vg/thinlv with XFS the kernel warns and then hangs with the following: Thanks, for whatever reason ext4 wasn't tickling that codepath but xfs does. I folded the fix into my for-jens branch, here's what changed: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b60b350..79cee1a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1098,7 +1098,7 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio, bio_integrity_clone(clone, bio, GFP_NOIO); bio_advance(clone, to_bytes(sector - clone-bi_iter.bi_sector)); - bio-bi_iter.bi_size = to_bytes(len); + clone-bi_iter.bi_size = to_bytes(len); if (bio_integrity(bio)) bio_integrity_trim(clone, 0, len); @@ -1267,7 +1267,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) if (!dm_target_is_valid(ti)) return -EIO; - len = min_t(unsigned, max_io_len(ci-sector, ti), bio_sectors(bio)); + len = min_t(sector_t, max_io_len(ci-sector, ti), ci-sector_count); __clone_and_map_data_bio(ci, ti, ci-sector, len); -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 16/22] dm: Refactor for new bio cloning/splitting
On Wed, Oct 02, 2013 at 11:23:21PM -0400, Mike Snitzer wrote: Great, thanks for finding this, I'll test and review the code further. Cool - let me know if you find anything else (how thoroughly do you think you've tested it so far?) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: submit_bio_wait() conversions
It was being open coded in a few places. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Joern Engel jo...@logfs.org Cc: Prasad Joshi prasadjoshi.li...@gmail.com Cc: Neil Brown ne...@suse.de Cc: Chris Mason chris.ma...@fusionio.com --- block/blk-flush.c | 19 +-- drivers/md/md.c| 12 +--- fs/btrfs/check-integrity.c | 32 +--- fs/btrfs/check-integrity.h | 2 ++ fs/btrfs/extent_io.c | 12 +--- fs/btrfs/scrub.c | 33 - fs/hfsplus/wrapper.c | 17 + fs/logfs/dev_bdev.c| 13 + 8 files changed, 24 insertions(+), 116 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 331e627..fb6f3c0 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, bio-bi_flags); - if (bio-bi_private) - complete(bio-bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio-bi_end_io = bio_end_flush; bio-bi_bdev = bdev; - bio-bi_private = wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in -bi_sector, if @@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio-bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a28..8700de3 100644 --- a/drivers/md/md.c +++ 
b/drivers/md/md.c @@ -776,16 +776,10 @@ void md_super_wait(struct mddev *mddev) finish_wait(mddev-sb_wait, wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio-bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev-mddev); - struct completion event; int ret; rw |= REQ_SYNC; @@ -801,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio-bi_sector = sector + rdev-data_offset; bio_add_page(bio, page, size, 0); - init_completion(event); - bio-bi_private = event; - bio-bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, bio-bi_flags); bio_put(bio); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b50764b..131d828 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (i = 0; i num_pages;) { struct bio *bio; unsigned int j; - DECLARE_COMPLETION_ONSTACK(complete); bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { @@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio-bi_bdev = block_ctx-dev-bdev; bio-bi_sector = dev_bytenr 9; - bio-bi_end_io = btrfsic_complete_bio_end_io; - bio-bi_private = complete; for (j = 
i; j num_pages; j++) { ret = bio_add_page(bio, block_ctx-pagev[j], @@ -1712,12 +1708,7 @@ static int btrfsic_read_block
[GIT PULL] Immutable biovecs
Jens - here's immutable biovecs, rebased and ready for 3.14. Changes since the last version of the series: * bio_clone_bioset() retains the old behaviour, as previously discussed - bio_clone_fast() is being used by bcache, dm and the new bio_split(). There aren't that many users of the old bio_clone() and we should eventually convert all the users that don't actually need it (for a couple of reasons), but that can be left to future cleanups. * refactoring patches to use submit_bio_wait() and bio_for_each_segment() Regarding the bio chaining stuff, I'm not terribly happy with all the bio_endio_nodec() calls but I want to keep that stuff simple for now - Mike Snitzer had a good suggestion for that though. But I am seeing more and more ways we can use that functionality to clean other stuff up - also, we should be able to drop bi_cnt eventually, bi_remaining seems like it can be used to replace the various bio_get() uses. The following changes since commit 6ce4eac1f600b34f2f7f58f9cd8f0503d79e42ae: Linux 3.13-rc1 (2013-11-22 11:30:55 -0800) are available in the git repository at: git://evilpiepirate.org/~kent/linux-bcache.git for-jens for you to fetch changes up to 4b1faf931650d4a35b2a570318862821d6a962e3: block: Kill bio_pair_split() (2013-11-23 22:33:57 -0800) Kent Overstreet (25): block: submit_bio_wait() conversions block: Convert various code to bio_for_each_segment() bcache: Kill unaligned bvec hack block: Abstract out bvec iterator dm: Use bvec_iter for dm_bio_record() block: Convert bio_iovec() to bvec_iter block: Convert bio_for_each_segment() to bvec_iter block: Immutable bio vecs block: Convert bio_copy_data() to bvec_iter bio-integrity: Convert to bvec_iter block: Kill bio_segments()/bi_vcnt usage block: Convert drivers to immutable biovecs aoe: Convert to immutable biovecs ceph: Convert to immutable biovecs block: Kill bio_iovec_idx(), __bio_iovec() block: Refactor bio_clone_bioset() for immutable biovecs block: Add bio_clone_fast() rbd: Refactor bio 
cloning dm: Refactor for new bio cloning/splitting block: Don't save/copy bvec array anymore block: Remove bi_idx hacks block: Generic bio chaining block: Rename bio_split() - bio_pair_split() block: Introduce new bio_split() block: Kill bio_pair_split() Documentation/block/biodoc.txt | 7 +- Documentation/block/biovecs.txt | 111 ++ arch/m68k/emu/nfblock.c | 13 +- arch/powerpc/sysdev/axonram.c | 21 +- block/blk-core.c| 40 +-- block/blk-flush.c | 21 +- block/blk-integrity.c | 40 ++- block/blk-lib.c | 12 +- block/blk-map.c | 6 +- block/blk-merge.c | 66 ++-- block/blk-mq.c | 2 +- block/blk-throttle.c| 14 +- block/elevator.c| 2 +- drivers/block/aoe/aoe.h | 10 +- drivers/block/aoe/aoecmd.c | 153 - drivers/block/brd.c | 16 +- drivers/block/drbd/drbd_actlog.c| 2 +- drivers/block/drbd/drbd_bitmap.c| 2 +- drivers/block/drbd/drbd_main.c | 27 +- drivers/block/drbd/drbd_receiver.c | 19 +- drivers/block/drbd/drbd_req.c | 6 +- drivers/block/drbd/drbd_req.h | 2 +- drivers/block/drbd/drbd_worker.c| 8 +- drivers/block/floppy.c | 16 +- drivers/block/loop.c| 27 +- drivers/block/mtip32xx/mtip32xx.c | 20 +- drivers/block/nbd.c | 14 +- drivers/block/nvme-core.c | 142 ++-- drivers/block/pktcdvd.c | 182 +- drivers/block/ps3disk.c | 17 +- drivers/block/ps3vram.c | 12 +- drivers/block/rbd.c | 91 + drivers/block/rsxx/dev.c| 6 +- drivers/block/rsxx/dma.c| 15 +- drivers/block/umem.c| 53 ++- drivers/block/xen-blkback/blkback.c | 2 +- drivers/block/xen-blkfront.c| 2 +- drivers/md/bcache/bcache.h | 2 - drivers/md/bcache/btree.c | 8 +- drivers/md/bcache/debug.c | 21 +- drivers/md/bcache/io.c | 196 ++- drivers/md/bcache/journal.c | 12 +- drivers/md/bcache/movinggc.c| 4 +- drivers/md/bcache/request.c | 131 +++- drivers/md/bcache/super.c | 20 +- drivers/md/bcache/util.c
[PATCH] Make generic_make_request() handle arbitrary size bios
This builds off of immutable biovecs; it's a small enough patch series that I'd like to try to get it into 3.14 but it is a pretty significant change in behaviour for the block layer so it should definitely be considered separately from that series. What the series does is pretty simple - like the title says, you can now pass generic_make_request arbitrary size bios - you no longer have to care about device limits, and we can start deleting a ton of code. This is done by splitting bios as needed. The idea is to push the splitting lower and lower over time, and in many cases we eventually won't have to actually split the bios at all - it'll happen implicitly - for now though, we do it the simple easy way, with a new function (blk_queue_split()) that mimics the old behaviour of bio_add_page(). This series only starts with the obvious easy things - bio_add_page(), and the stuff in blk-lib.c. There's a lot more coming - with the new bio_split() that can split arbitrary size bios the various virtual block drivers (md, dm, drbd, etc.) should all either be capable of handling arbitrary size bios now or mostly need minor changes, so we can delete merge_bvec_fn. This patch series doesn't deal with REQ_TYPE_BLOCK_PC bios - bio_add_pc_page() and the blk_execute_rq() codepath are unchanged. I'd like to get rid of direct request submission eventually, but that is a different patch series. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/5] block: Make generic_make_request handle arbitrary sized bios
The way the block layer is currently written, it goes to great lengths to avoid having to split bios; upper layer code (such as bio_add_page()) checks what the underlying device can handle and tries to always create bios that don't need to be split. But this approach becomes unwieldy and eventually breaks down with stacked devices and devices with dynamic limits, and it adds a lot of complexity. If the block layer could split bios as needed, we could eliminate a lot of complexity elsewhere - particularly in stacked drivers. Code that creates bios can then create whatever size bios are convenient, and more importantly stacked drivers don't have to deal with both their own bio size limitations and the limitations of the (potentially multiple) devices underneath them. In the future this will let us delete merge_bvec_fn and a bunch of other code. We do this by adding calls to blk_queue_split() to the various make_request functions that need it - a few can already handle arbitrary size bios. Note that we add the call _after_ any call to blk_queue_bounce(); this means that blk_queue_split() and blk_recalc_rq_segments() don't need to be concerned with bouncing affecting segment merging. Some make_request_fns were simple enough to audit and verify they don't need blk_queue_split() calls. The skipped ones are: * nfhd_make_request (arch/m68k/emu/nfblock.c) * axon_ram_make_request (arch/powerpc/sysdev/axonram.c) * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c) * brd_make_request (ramdisk - drivers/block/brd.c) * loop_make_request * null_queue_bio * bcache's make_request fns Some others are almost certainly safe to remove now, but will be left for future patches. 
Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Neil Brown ne...@suse.de Cc: Alasdair Kergon a...@redhat.com Cc: dm-de...@redhat.com --- block/blk-core.c| 19 ++-- block/blk-merge.c | 150 ++-- block/blk-mq.c | 2 + drivers/block/drbd/drbd_req.c | 2 + drivers/block/mtip32xx/mtip32xx.c | 6 +- drivers/block/nvme-core.c | 2 + drivers/block/pktcdvd.c | 6 +- drivers/block/ps3vram.c | 2 + drivers/block/rsxx/dev.c| 2 + drivers/block/umem.c| 2 + drivers/md/dm.c | 2 + drivers/md/md.c | 2 + drivers/s390/block/dcssblk.c| 2 + drivers/s390/block/xpram.c | 2 + drivers/staging/lustre/lustre/llite/lloop.c | 2 + drivers/staging/zram/zram_drv.c | 2 + include/linux/blkdev.h | 3 + 17 files changed, 185 insertions(+), 23 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5da8e90..5cc7171 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -566,6 +566,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q-id 0) goto fail_c; + q-bio_split = bioset_create(4, 0); + if (!q-bio_split) + goto fail_id; + q-backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q-backing_dev_info.state = 0; @@ -575,7 +579,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) err = bdi_init(q-backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(q-backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -620,6 +624,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_bdi: bdi_destroy(q-backing_dev_info); +fail_split: + bioset_free(q-bio_split); fail_id: ida_simple_remove(blk_queue_ida, q-id); fail_c: @@ -1472,6 +1478,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) struct request *req; unsigned int request_count = 0; + blk_queue_split(q, bio, q-bio_split); + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1694,15 
+1702,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) - nr_sectors queue_max_hw_sectors(q))) { - printk(KERN_ERR bio too big device %s (%u %u)\n, - bdevname(bio-bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio-bi_bdev-bd_part; if (should_fail_request(part, bio-bi_iter.bi_size) || should_fail_request(part_to_disk(part)-part0, diff --git a/block/blk-merge.c b/block/blk
[PATCH 4/5] blk-lib.c: generic_make_request() handles large bios now
generic_make_request() will now do for us what the code in blk-lib.c was doing manually, with the bio_batch stuff - we still need some looping in case we're trying to discard/zeroout more than around a gigabyte, but when we can submit that much at a time doing the submissions in parallel really shouldn't matter. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- block/blk-lib.c | 175 ++-- 1 file changed, 30 insertions(+), 145 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 2da76c9..368c36a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -9,23 +9,6 @@ #include blk.h -struct bio_batch { - atomic_tdone; - unsigned long flags; - struct completion *wait; -}; - -static void bio_batch_end_io(struct bio *bio, int err) -{ - struct bio_batch *bb = bio-bi_private; - - if (err (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, bb-flags); - if (atomic_dec_and_test(bb-done)) - complete(bb-wait); - bio_put(bio); -} - /** * blkdev_issue_discard - queue a discard * @bdev: blockdev to issue discard for @@ -40,15 +23,10 @@ static void bio_batch_end_io(struct bio *bio, int err) int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; - int alignment; - struct bio_batch bb; struct bio *bio; int ret = 0; - struct blk_plug plug; if (!q) return -ENXIO; @@ -56,78 +34,28 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!blk_queue_discard(q)) return -EOPNOTSUPP; - /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q-limits.discard_granularity 9, 1U); - alignment = (bdev_discard_alignment(bdev) 9) % granularity; - - /* -* Ensure that max_discard_sectors is of the proper -* granularity, so that requests stay aligned after a split. 
-*/ - max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; type |= REQ_SECURE; } - atomic_set(bb.done, 1); - bb.flags = 1 BIO_UPTODATE; - bb.wait = wait; - - blk_start_plug(plug); while (nr_sects) { - unsigned int req_sects; - sector_t end_sect, tmp; - bio = bio_alloc(gfp_mask, 1); - if (!bio) { - ret = -ENOMEM; - break; - } - - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); - - /* -* If splitting a request, and the next starting sector would be -* misaligned, stop the discard at the previous aligned sector. -*/ - end_sect = sector + req_sects; - tmp = end_sect; - if (req_sects nr_sects - sector_div(tmp, granularity) != alignment) { - end_sect = end_sect - alignment; - sector_div(end_sect, granularity); - end_sect = end_sect * granularity + alignment; - req_sects = end_sect - sector; - } + if (!bio) + return -ENOMEM; - bio-bi_iter.bi_sector = sector; - bio-bi_end_io = bio_batch_end_io; bio-bi_bdev = bdev; - bio-bi_private = bb; + bio-bi_iter.bi_sector = sector; + bio-bi_iter.bi_size = min_t(sector_t, nr_sects, 1 20) 9; - bio-bi_iter.bi_size = req_sects 9; - nr_sects -= req_sects; - sector = end_sect; + sector += bio_sectors(bio); + nr_sects -= bio_sectors(bio); - atomic_inc(bb.done); - submit_bio(type, bio); + ret = submit_bio_wait(type, bio); + if (ret) + break; } - blk_finish_plug(plug); - - /* Wait for bios in-flight */ - if (!atomic_dec_and_test(bb.done)) - wait_for_completion_io(wait); - - if (!test_bit(BIO_UPTODATE, bb.flags)) - ret = -EIO; return
[PATCH 2/5] mtip32xx: handle arbitrary size bios
We get a measurable performance increase by handling this in the driver when we're already looping over the biovec, instead of handling it separately in generic_make_request() (or bio_add_page() originally) Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Asai Thambi S P asamymuth...@micron.com Cc: Sam Bradshaw sbrads...@micron.com --- drivers/block/mtip32xx/mtip32xx.c | 47 --- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1b3dfee..2b142eb 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2648,24 +2648,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t sector, } /* - * Release a command slot. - * - * @dd Pointer to the driver data structure. - * @tag Slot tag - * - * return value - * None - */ -static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag, - int unaligned) -{ - struct semaphore *sem = unaligned ? dd-port-cmd_slot_unal : - dd-port-cmd_slot; - release_slot(dd-port, tag); - up(sem); -} - -/* * Obtain a command slot and return its associated scatter list. * * @dd Pointer to the driver data structure. @@ -3969,8 +3951,6 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) blk_queue_bounce(queue, bio); - blk_queue_split(queue, bio, q-bio_split); - if (unlikely(dd-dd_flag MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, dd-dd_flag))) { @@ -4020,21 +4000,22 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) sg = mtip_hw_get_scatterlist(dd, tag, unaligned); if (likely(sg != NULL)) { - if (unlikely((bio)-bi_vcnt MTIP_MAX_SG)) { - dev_warn(dd-pdev-dev, - Maximum number of SGL entries exceeded\n); - bio_io_error(bio); - mtip_hw_release_scatterlist(dd, tag, unaligned); - return; - } - /* Create the scatter list for this bio. 
*/ bio_for_each_segment(bvec, bio, iter) { - sg_set_page(sg[nents], - bvec.bv_page, - bvec.bv_len, - bvec.bv_offset); - nents++; + if (unlikely(nents == MTIP_MAX_SG)) { + struct bio *split = bio_clone(bio, GFP_NOIO); + + split-bi_iter = iter; + bio-bi_iter.bi_size -= iter.bi_size; + bio_chain(split, bio); + generic_make_request(split); + break; + } + + sg_set_page(sg[nents++], + bvec.bv_page, + bvec.bv_len, + bvec.bv_offset); } /* Issue the read/write. */ -- 1.8.4.3 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/5] block: Gut bio_add_page()
Since generic_make_request() can now handle arbitrary size bios, all we have to do is make sure the bvec array doesn't overflow. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- fs/bio.c | 137 ++- 1 file changed, 57 insertions(+), 80 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 75c49a3..da27557 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -690,9 +690,23 @@ int bio_get_nr_vecs(struct block_device *bdev) } EXPORT_SYMBOL(bio_get_nr_vecs); -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) +/** + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. 
+ */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset) { int retried_segments = 0; struct bio_vec *bvec; @@ -703,7 +717,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio-bi_iter.bi_size + len) 9) max_sectors) + if (((bio-bi_iter.bi_size + len) 9) queue_max_hw_sectors(q)) return 0; /* @@ -716,28 +730,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev-bv_page offset == prev-bv_offset + prev-bv_len) { - unsigned int prev_bv_len = prev-bv_len; prev-bv_len += len; - - if (q-merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio-bi_bdev, - .bi_sector = bio-bi_iter.bi_sector, - .bi_size = bio-bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio-bi_rw, - }; - - if (q-merge_bvec_fn(q, bvm, prev) prev-bv_len) { - prev-bv_len -= len; - return 0; - } - } - goto done; } } @@ -768,31 +761,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bvec-bv_len = len; bvec-bv_offset = offset; - /* -* if queue has other restrictions (eg varying max sector size -* depending on offset), it can specify a merge_bvec_fn in the -* queue to get further control -*/ - if (q-merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio-bi_bdev, - .bi_sector = bio-bi_iter.bi_sector, - .bi_size = bio-bi_iter.bi_size, - .bi_rw = bio-bi_rw, - }; - - /* -* merge_bvec_fn() returns number of bytes it can accept -* at this offset -*/ - if (q-merge_bvec_fn(q, bvm, bvec) bvec-bv_len) { - bvec-bv_page = NULL; - bvec-bv_len = 0; - bvec-bv_offset = 0; - return 0; - } - } - /* If we may be able to merge these biovecs, force a recount */ if (bio-bi_vcnt (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio-bi_flags = ~(1 
BIO_SEG_VALID); @@ -803,28 +771,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bio-bi_iter.bi_size += len; return len; } - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist
[PATCH 5/5] bcache: generic_make_request() handles large bios now
So we get to delete our hacky workaround. Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/md/bcache/bcache.h| 18 drivers/md/bcache/io.c| 100 +- drivers/md/bcache/journal.c | 4 +- drivers/md/bcache/request.c | 16 +++ drivers/md/bcache/super.c | 32 +- drivers/md/bcache/util.h | 5 ++- drivers/md/bcache/writeback.c | 4 +- 7 files changed, 18 insertions(+), 161 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 964353c..8f65331 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -241,19 +241,6 @@ struct keybuf { DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; -struct bio_split_pool { - struct bio_set *bio_split; - mempool_t *bio_split_hook; -}; - -struct bio_split_hook { - struct closure cl; - struct bio_split_pool *p; - struct bio *bio; - bio_end_io_t*bi_end_io; - void*bi_private; -}; - struct bcache_device { struct closure cl; @@ -286,8 +273,6 @@ struct bcache_device { int (*cache_miss)(struct btree *, struct search *, struct bio *, unsigned); int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); - - struct bio_split_pool bio_split_hook; }; struct io { @@ -465,8 +450,6 @@ struct cache { atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; - - struct bio_split_pool bio_split_hook; }; struct gc_stat { @@ -901,7 +884,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); -void bch_generic_make_request(struct bio *, struct bio_split_pool *); void __bch_submit_bbio(struct bio *, struct cache_set *); void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fa028fa..86a0bb8 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -11,104 +11,6 @@ #include linux/blkdev.h -static unsigned 
bch_bio_max_sectors(struct bio *bio) -{ - struct request_queue *q = bdev_get_queue(bio-bi_bdev); - struct bio_vec bv; - struct bvec_iter iter; - unsigned ret = 0, seg = 0; - - if (bio-bi_rw REQ_DISCARD) - return min(bio_sectors(bio), q-limits.max_discard_sectors); - - bio_for_each_segment(bv, bio, iter) { - struct bvec_merge_data bvm = { - .bi_bdev= bio-bi_bdev, - .bi_sector = bio-bi_iter.bi_sector, - .bi_size= ret 9, - .bi_rw = bio-bi_rw, - }; - - if (seg == min_t(unsigned, BIO_MAX_PAGES, -queue_max_segments(q))) - break; - - if (q-merge_bvec_fn - q-merge_bvec_fn(q, bvm, bv) (int) bv.bv_len) - break; - - seg++; - ret += bv.bv_len 9; - } - - ret = min(ret, queue_max_sectors(q)); - - WARN_ON(!ret); - ret = max_t(int, ret, bio_iovec(bio).bv_len 9); - - return ret; -} - -static void bch_bio_submit_split_done(struct closure *cl) -{ - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - s-bio-bi_end_io = s-bi_end_io; - s-bio-bi_private = s-bi_private; - bio_endio_nodec(s-bio, 0); - - closure_debug_destroy(s-cl); - mempool_free(s, s-p-bio_split_hook); -} - -static void bch_bio_submit_split_endio(struct bio *bio, int error) -{ - struct closure *cl = bio-bi_private; - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - - if (error) - clear_bit(BIO_UPTODATE, s-bio-bi_flags); - - bio_put(bio); - closure_put(cl); -} - -void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) -{ - struct bio_split_hook *s; - struct bio *n; - - if (!bio_has_data(bio) !(bio-bi_rw REQ_DISCARD)) - goto submit; - - if (bio_sectors(bio) = bch_bio_max_sectors(bio)) - goto submit; - - s = mempool_alloc(p-bio_split_hook, GFP_NOIO); - closure_init(s-cl, NULL); - - s-bio = bio; - s-p= p; - s-bi_end_io= bio-bi_end_io; - s-bi_private = bio-bi_private; - bio_get(bio); - - do { - n = bio_next_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s-p-bio_split); - - n-bi_end_io= bch_bio_submit_split_endio
Re: GPF in aio_migratepage
On Tue, Nov 26, 2013 at 01:01:32AM -0500, Dave Jones wrote: On Mon, Nov 25, 2013 at 10:26:45PM -0500, Dave Jones wrote: Hi Kent, I hit the GPF below on a tree based on 8e45099e029bb6b369b27d8d4920db8caff5ecce which has your commit e34ecee2ae791df674dfb466ce40692ca6218e43 (aio: Fix a trinity splat). Is this another path your patch missed, or a completely different bug to what you were chasing ? And here's another from a different path, this time on 32bit. I'm pretty sure this is a different bug... it appears to be related to aio ring buffer migration, which I don't think I've touched. Any information on what it was doing at the time? I see exit_aio() in the second backtrace, maybe some sort of race between migratepage and ioctx teardown? But it is using the address space mapping, so I dunno. I don't see what's protecting ctx-ring_pages - I imagine it's got to have something to do with the page migration machinery but I have no idea how that works. Ben? Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: tun fuse hidp rfcomm bnep scsi_transport_iscsi l2tp_ppp l2tp_netlink l2tp_core nfc caif_socket caif af_802154 phonet af_rxrpc bluetooth rfkill can_raw can_bcm can llc2 pppoe pppox ppp_generic slhc irda crc_ccitt rds af_key rose x25 atm netrom appletalk ipx p8023 p8022 psnap llc ax25 nouveau video backlight mxm_wmi wmi i2c_algo_bit ttm drm_kms_helper drm i2c_core kvm_intel kvm tg3 ptp pps_core libphy serio_raw pcspkr lpc_ich microcode mfd_core rtc_cmos parport_pc parport shpchp xfs libcrc32c raid0 floppy CPU: 0 PID: 4517 Comm: trinity-child0 Not tainted 3.13.0-rc1+ #6 Hardware name: Dell Inc. 
Precision WorkStation 490 /0DT031, BIOS A08 04/25/2008 task: ed899630 ti: dea22000 task.ti: dea22000 EIP: 0060:[c11c7a02] EFLAGS: 00010293 CPU: 0 EIP is at aio_migratepage+0xad/0x126 EAX: 0144 EBX: f6844ed8 ECX: deaf4a84 EDX: 6b6b6b6b ESI: f68dc508 EDI: deaf4800 EBP: dea23bcc ESP: dea23ba8 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 6b6b707b CR3: 2c985000 CR4: 07f0 Stack: 0001 deaf4a84 0286 d709b280 f68dc508 c11c7955 f6844ed8 dea23c0c c116aa9f 0001 0001 c11c7955 c1179a33 c114166d f6844ed8 f6844ed8 c1140fc9 dea23c0c f6844ed8 Call Trace: [c11c7955] ? free_ioctx+0x62/0x62 [c116aa9f] move_to_new_page+0x63/0x1bb [c11c7955] ? free_ioctx+0x62/0x62 [c1179a33] ? mem_cgroup_prepare_migration+0xc1/0x243 [c114166d] ? isolate_migratepages_range+0x3fb/0x675 [c1140fc9] ? isolate_freepages_block+0x316/0x316 [c116b319] migrate_pages+0x614/0x72b [c1140fc9] ? isolate_freepages_block+0x316/0x316 [c1141c21] compact_zone+0x294/0x475 [c1142065] try_to_compact_pages+0x129/0x196 [c15b95e7] __alloc_pages_direct_compact+0x91/0x197 [c112a25c] __alloc_pages_nodemask+0x863/0xa55 [c116b68f] get_huge_zero_page+0x52/0xf9 [c116ef78] do_huge_pmd_anonymous_page+0x24e/0x39f [c1171c4b] ? __mem_cgroup_count_vm_event+0xa6/0x191 [c1171c64] ? __mem_cgroup_count_vm_event+0xbf/0x191 [c114815c] handle_mm_fault+0x235/0xd9a [c15c7586] ? __do_page_fault+0xf8/0x5a1 [c15c75ee] __do_page_fault+0x160/0x5a1 [c15c7586] ? __do_page_fault+0xf8/0x5a1 [c15c7a2f] ? __do_page_fault+0x5a1/0x5a1 [c15c7a3c] do_page_fault+0xd/0xf [c15c4e7c] error_code+0x6c/0x74 [c114007b] ? memcg_update_all_caches+0x23/0x6b [c12d0be5] ? __copy_from_user_ll+0x30/0xdb [c12d0ccf] _copy_from_user+0x3f/0x55 [c1057aa2] SyS_setrlimit+0x27/0x50 [c1044792] ? SyS_gettimeofday+0x33/0x6d [c12d0798] ? 
trace_hardirqs_on_thunk+0xc/0x10 [c15cb33b] sysenter_do_call+0x12/0x32 Code: 6e 8d 8f 84 02 00 00 89 c8 89 4d e4 e8 df bf 3f 00 89 45 e8 89 da 89 f0 e8 99 2b fa ff 8b 43 08 3b 47 54 8b 4d e4 73 06 8b 57 50 89 34 82 8b 55 e8 89 c8 e8 aa c1 3f 00 8b 45 ec e8 28 c1 3f 00 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/5] block: Make generic_make_request handle arbitrary sized bios
On Mon, Nov 25, 2013 at 10:09:21PM -0800, Christoph Hellwig wrote: + q->bio_split = bioset_create(4, 0); + if (!q->bio_split) + goto fail_id; How did we arrive at a mempool size of 4 to make sure we can always make progress with arbitrarily sized bios? Shouldn't we document the design decision somewhere? It just has to be nonzero to guarantee forward progress - the bio_alloc_bioset() rescuer thing I did awhile back guarantees that. +static struct bio *blk_bio_discard_split(struct request_queue *q, +struct bio *bio, +struct bio_set *bs) +{ One fallout of this should be that it would allow us to implement a range UNMAP/TRIM much more easily, which would help dramatically with TRIM performance! Cool! -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/5] mtip32xx: handle arbitrary size bios
On Mon, Nov 25, 2013 at 10:10:18PM -0800, Christoph Hellwig wrote: On Mon, Nov 25, 2013 at 02:30:30PM -0800, Kent Overstreet wrote: We get a measurable performance increase by handling this in the driver when we're already looping over the biovec, instead of handling it separately in generic_make_request() (or bio_add_page() originally) Given that Jens has a patch queue up to convert the driver to blk-mq adjusting the blk-mq infrastructure to handle this might be a better plan forward.. I mostly wanted the patch in the series as an example conversion - converting blk-mq is going to take some thought and time to familiarize myself with the code. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: Silence spurious compiler warnings
From 46e7081430f5f483906f496733a23f8e9d898879 Mon Sep 17 00:00:00 2001 From: Kent Overstreet k...@daterainc.com Date: Tue, 26 Nov 2013 16:36:49 -0800 Subject: [PATCH] block: Silence spurious compiler warnings Signed-off-by: Kent Overstreet k...@daterainc.com --- On Tue, Nov 26, 2013 at 12:02:08PM -0700, Jens Axboe wrote: On 11/26/2013 12:01 PM, Olof Johansson wrote: I just noticed that i see this with gcc 4.7.0, but 4.8.1 does not warn. That's good, because it's not a bug. But arguably we should shut up 4.7 as well, however... Here you go: block/blk-merge.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 05c17be..0b097f6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -89,6 +89,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio_vec end_bv, nxt_bv; struct bvec_iter iter; + uninitialized_var(end_bv); + if (!blk_queue_cluster(q)) return 0; @@ -173,6 +175,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg; int nsegs, cluster; + uninitialized_var(bvprv); + nsegs = 0; cluster = blk_queue_cluster(q); @@ -235,6 +239,8 @@ int blk_bio_map_sg(struct request_queue *q, struct bio *bio, int nsegs, cluster; struct bvec_iter iter; + uninitialized_var(bvprv); + nsegs = 0; cluster = blk_queue_cluster(q); -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [GIT PULL] Immutable biovecs
On Mon, Nov 25, 2013 at 10:05:58PM -0800, Christoph Hellwig wrote: On Mon, Nov 25, 2013 at 01:52:16PM -0800, Kent Overstreet wrote: Jens - here's immutable biovecs, rebased and ready for 3.14. Changes since the last version of the series: Can you do a resend of the patch series to all involved lists first so we can have a detailed look at the current version? Sure thing. * bio_clone_bioset() retains the old behaviour, as previously discussed - bio_clone_fast() is being used by bcache, dm and the new bio_split(). Any chance to have a more descriptive name than bio_clone_fast? Also without having the actual patches in front of me, did you make sure to document the difference in semantics in detail? I haven't been able to think of anything I don't hate, I'm open to suggestions. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/25] dm: Use bvec_iter for dm_bio_record()
This patch doesn't itself have any functional changes, but immutable biovecs are going to add a bi_bvec_done member to bi_iter, which will need to be saved too here. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Alasdair Kergon a...@redhat.com Cc: dm-de...@redhat.com Reviewed-by: Mike Snitzer snit...@redhat.com --- drivers/md/dm-bio-record.h | 12 +++- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 5ace48e..4f46e8e 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -28,11 +28,9 @@ struct dm_bio_vec_details { }; struct dm_bio_details { - sector_t bi_sector; struct block_device *bi_bdev; - unsigned int bi_size; - unsigned short bi_idx; unsigned long bi_flags; + struct bvec_iter bi_iter; struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; }; @@ -40,11 +38,9 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { unsigned i; - bd-bi_sector = bio-bi_iter.bi_sector; bd-bi_bdev = bio-bi_bdev; - bd-bi_size = bio-bi_iter.bi_size; - bd-bi_idx = bio-bi_iter.bi_idx; bd-bi_flags = bio-bi_flags; + bd-bi_iter = bio-bi_iter; for (i = 0; i bio-bi_vcnt; i++) { bd-bi_io_vec[i].bv_len = bio-bi_io_vec[i].bv_len; @@ -56,11 +52,9 @@ static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { unsigned i; - bio-bi_iter.bi_sector = bd-bi_sector; bio-bi_bdev = bd-bi_bdev; - bio-bi_iter.bi_size = bd-bi_size; - bio-bi_iter.bi_idx = bd-bi_idx; bio-bi_flags = bd-bi_flags; + bio-bi_iter = bd-bi_iter; for (i = 0; i bio-bi_vcnt; i++) { bio-bi_io_vec[i].bv_len = bd-bi_io_vec[i].bv_len; -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 06/25] block: Convert bio_iovec() to bvec_iter
For immutable biovecs, we'll be introducing a new bio_iovec() that uses our new bvec iterator to construct a biovec, taking into account bvec_iter-bi_bvec_done - this patch updates existing users for the new usage. Some of the existing users really do need a pointer into the bvec array - those uses are all going to be removed, but we'll need the functionality from immutable to remove them - so for now rename the existing bio_iovec() - __bio_iovec(), and it'll be removed in a couple patches. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Ed L. Cashin ecas...@coraid.com Cc: Alasdair Kergon a...@redhat.com Cc: dm-de...@redhat.com Cc: James E.J. Bottomley jbottom...@parallels.com --- drivers/block/aoe/aoecmd.c | 2 +- drivers/md/bcache/io.c | 13 +++-- drivers/md/dm-verity.c | 2 +- drivers/scsi/sd.c | 2 +- fs/bio.c | 20 ++-- include/linux/bio.h| 10 ++ 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 877ba11..77c24ab 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -932,7 +932,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio) buf-resid = bio-bi_iter.bi_size; buf-sector = bio-bi_iter.bi_sector; bio_pageinc(bio); - buf-bv = bio_iovec(bio); + buf-bv = __bio_iovec(bio); buf-bv_resid = buf-bv-bv_len; WARN_ON(buf-bv_resid == 0); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index cc4ba2d..dc44f06 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -22,11 +22,12 @@ static void bch_bi_idx_hack_endio(struct bio *bio, int error) static void bch_generic_make_request_hack(struct bio *bio) { if (bio-bi_iter.bi_idx) { + int i; + struct bio_vec *bv; struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); - memcpy(clone-bi_io_vec, - bio_iovec(bio), - bio_segments(bio) * sizeof(struct bio_vec)); + bio_for_each_segment(bv, bio, i) + clone-bi_io_vec[clone-bi_vcnt++] = *bv; clone-bi_iter.bi_sector = 
bio-bi_iter.bi_sector; clone-bi_bdev = bio-bi_bdev; @@ -97,7 +98,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (!ret) return NULL; - memcpy(ret-bi_io_vec, bio_iovec(bio), + memcpy(ret-bi_io_vec, __bio_iovec(bio), sizeof(struct bio_vec) * vcnt); break; @@ -106,7 +107,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (!ret) return NULL; - memcpy(ret-bi_io_vec, bio_iovec(bio), + memcpy(ret-bi_io_vec, __bio_iovec(bio), sizeof(struct bio_vec) * vcnt); ret-bi_io_vec[vcnt - 1].bv_len = nbytes; @@ -182,7 +183,7 @@ static unsigned bch_bio_max_sectors(struct bio *bio) ret = min(ret, queue_max_sectors(q)); WARN_ON(!ret); - ret = max_t(int, ret, bio_iovec(bio)-bv_len 9); + ret = max_t(int, ret, bio_iovec(bio).bv_len 9); return ret; } diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 132b315..5392135 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -524,7 +524,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) io-io_vec = io-io_vec_inline; else io-io_vec = mempool_alloc(v-vec_mempool, GFP_NOIO); - memcpy(io-io_vec, bio_iovec(bio), + memcpy(io-io_vec, __bio_iovec(bio), io-io_vec_size * sizeof(struct bio_vec)); verity_submit_prefetch(v, io); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e6c4bff..200d6bc 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -801,7 +801,7 @@ static int sd_setup_write_same_cmnd(struct scsi_device *sdp, struct request *rq) if (sdkp-device-no_write_same) return BLKPREP_KILL; - BUG_ON(bio_offset(bio) || bio_iovec(bio)-bv_len != sdp-sector_size); + BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp-sector_size); sector = ilog2(sdp-sector_size) - 9; nr_sectors = ilog2(sdp-sector_size) - 9; diff --git a/fs/bio.c b/fs/bio.c index a402ad6..7bb281f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -821,12 +821,12 @@ void bio_advance(struct bio *bio, unsigned bytes) break; } - if (bytes = bio_iovec(bio)-bv_len) { - bytes -= bio_iovec(bio)-bv_len; + if (bytes = 
bio_iovec(bio).bv_len) { + bytes -= bio_iovec(bio).bv_len
[PATCH 01/25] block: submit_bio_wait() conversions
It was being open coded in a few places. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Joern Engel jo...@logfs.org Cc: Prasad Joshi prasadjoshi.li...@gmail.com Cc: Neil Brown ne...@suse.de Cc: Chris Mason chris.ma...@fusionio.com Acked-by: NeilBrown ne...@suse.de --- block/blk-flush.c | 19 +-- drivers/md/md.c| 14 +- fs/btrfs/check-integrity.c | 32 +--- fs/btrfs/check-integrity.h | 2 ++ fs/btrfs/extent_io.c | 12 +--- fs/btrfs/scrub.c | 33 - fs/hfsplus/wrapper.c | 17 + fs/logfs/dev_bdev.c| 13 + 8 files changed, 24 insertions(+), 118 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 331e627..fb6f3c0 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q) } } -static void bio_end_flush(struct bio *bio, int err) -{ - if (err) - clear_bit(BIO_UPTODATE, bio-bi_flags); - if (bio-bi_private) - complete(bio-bi_private); - bio_put(bio); -} - /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for @@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err) int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, sector_t *error_sector) { - DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; int ret = 0; @@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return -ENXIO; bio = bio_alloc(gfp_mask, 0); - bio-bi_end_io = bio_end_flush; bio-bi_bdev = bdev; - bio-bi_private = wait; - bio_get(bio); - submit_bio(WRITE_FLUSH, bio); - wait_for_completion_io(wait); + ret = submit_bio_wait(WRITE_FLUSH, bio); /* * The driver must store the error location in -bi_sector, if @@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, if (error_sector) *error_sector = bio-bi_sector; - if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); return ret; } diff --git a/drivers/md/md.c b/drivers/md/md.c index b6b7a28..739b1ec 100644 --- 
a/drivers/md/md.c +++ b/drivers/md/md.c @@ -776,20 +776,12 @@ void md_super_wait(struct mddev *mddev) finish_wait(mddev-sb_wait, wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio-bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev-mddev); - struct completion event; int ret; - rw |= REQ_SYNC; - bio-bi_bdev = (metadata_op rdev-meta_bdev) ? rdev-meta_bdev : rdev-bdev; if (metadata_op) @@ -801,11 +793,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio-bi_sector = sector + rdev-data_offset; bio_add_page(bio, page, size, 0); - init_completion(event); - bio-bi_private = event; - bio-bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, bio-bi_flags); bio_put(bio); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b50764b..131d828 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); static void btrfsic_dump_database(struct btrfsic_state *state); -static void btrfsic_complete_bio_end_io(struct bio *bio, int err); static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages); static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, @@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, for (i = 0; i num_pages;) { struct bio *bio; unsigned int j; - DECLARE_COMPLETION_ONSTACK(complete); bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); if (!bio) { @@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio-bi_bdev = 
block_ctx-dev-bdev; bio-bi_sector = dev_bytenr 9; - bio-bi_end_io = btrfsic_complete_bio_end_io; - bio-bi_private = complete
[PATCH 03/25] bcache: Kill unaligned bvec hack
Bcache has a hack to avoid cloning the biovec if it's all full pages - but with immutable biovecs coming this won't be necessary anymore. For now, we remove the special case and always clone the bvec array so that the immutable biovec patches are simpler. Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/request.c | 37 +++-- drivers/md/bcache/super.c | 4 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4beb55a..6b6fe93 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -279,7 +279,6 @@ struct bcache_device { unsigned long sectors_dirty_last; longsectors_dirty_derivative; - mempool_t *unaligned_bvec; struct bio_set *bio_split; unsigneddata_csum:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fbcc851..78bab41 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -606,7 +606,6 @@ struct search { unsignedinsert_bio_sectors; unsignedrecoverable:1; - unsignedunaligned_bvec:1; unsignedwrite:1; unsignedread_dirty_data:1; @@ -614,6 +613,7 @@ struct search { struct btree_op op; struct data_insert_op iop; + struct bio_vec bv[BIO_MAX_PAGES]; }; static void bch_cache_read_endio(struct bio *bio, int error) @@ -759,10 +759,14 @@ static void bio_complete(struct search *s) static void do_bio_hook(struct search *s) { struct bio *bio = s-bio.bio; - memcpy(bio, s-orig_bio, sizeof(struct bio)); + bio_init(bio); + bio-bi_io_vec = s-bv; + bio-bi_max_vecs= BIO_MAX_PAGES; + __bio_clone(bio, s-orig_bio); bio-bi_end_io = request_endio; bio-bi_private = s-cl; + atomic_set(bio-bi_cnt, 3); } @@ -774,9 +778,6 @@ static void search_free(struct closure *cl) if (s-iop.bio) bio_put(s-iop.bio); - if (s-unaligned_bvec) - mempool_free(s-bio.bio.bi_io_vec, s-d-unaligned_bvec); - closure_debug_destroy(cl); mempool_free(s, s-d-c-search); } @@ -784,7 +785,6 @@ static void search_free(struct closure 
*cl) static struct search *search_alloc(struct bio *bio, struct bcache_device *d) { struct search *s; - struct bio_vec *bv; s = mempool_alloc(d-c-search, GFP_NOIO); memset(s, 0, offsetof(struct search, iop.insert_keys)); @@ -803,15 +803,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s-start_time = jiffies; do_bio_hook(s); - if (bio-bi_size != bio_segments(bio) * PAGE_SIZE) { - bv = mempool_alloc(d-unaligned_bvec, GFP_NOIO); - memcpy(bv, bio_iovec(bio), - sizeof(struct bio_vec) * bio_segments(bio)); - - s-bio.bio.bi_io_vec= bv; - s-unaligned_bvec = 1; - } - return s; } @@ -850,26 +841,13 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = s-bio.bio; - struct bio_vec *bv; - int i; if (s-recoverable) { /* Retry from the backing device: */ trace_bcache_read_retry(s-orig_bio); s-iop.error = 0; - bv = s-bio.bio.bi_io_vec; do_bio_hook(s); - s-bio.bio.bi_io_vec = bv; - - if (!s-unaligned_bvec) - bio_for_each_segment(bv, s-orig_bio, i) - bv-bv_offset = 0, bv-bv_len = PAGE_SIZE; - else - memcpy(s-bio.bio.bi_io_vec, - bio_iovec(s-orig_bio), - sizeof(struct bio_vec) * - bio_segments(s-orig_bio)); /* XXX: invalidate cache */ @@ -905,8 +883,7 @@ static void cached_dev_read_done(struct closure *cl) s-cache_miss = NULL; } - if (verify(dc, s-bio.bio) s-recoverable - !s-unaligned_bvec !s-read_dirty_data) + if (verify(dc, s-bio.bio) s-recoverable !s-read_dirty_data) bch_data_verify(dc, s-orig_bio); bio_complete(s); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dec15cd..1d9ee67 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -739,8 +739,6 @@ static void bcache_device_free(struct bcache_device *d) } bio_split_pool_free(d-bio_split_hook); - if (d-unaligned_bvec) - mempool_destroy(d
[PATCH 18/25] rbd: Refactor bio cloning
Now that we've got drivers converted to the new immutable bvec primitives, bio splitting becomes much easier - this is how the new bio_split() will work. (Someone more familiar with the ceph code could probably use bio_clone_fast() instead of bio_clone() here). Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Yehuda Sadeh yeh...@inktank.com Cc: Alex Elder el...@inktank.com Cc: ceph-de...@vger.kernel.org --- drivers/block/rbd.c | 64 ++--- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 20e8ab3..3624368 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1173,73 +1173,13 @@ static struct bio *bio_clone_range(struct bio *bio_src, unsigned int len, gfp_t gfpmask) { - struct bio_vec bv; - struct bvec_iter iter; - struct bvec_iter end_iter; - unsigned int resid; - unsigned int voff; - unsigned short vcnt; struct bio *bio; - /* Handle the easy case for the caller */ - - if (!offset len == bio_src-bi_iter.bi_size) - return bio_clone(bio_src, gfpmask); - - if (WARN_ON_ONCE(!len)) - return NULL; - if (WARN_ON_ONCE(len bio_src-bi_iter.bi_size)) - return NULL; - if (WARN_ON_ONCE(offset bio_src-bi_iter.bi_size - len)) - return NULL; - - /* Find first affected segment... 
*/ - - resid = offset; - bio_for_each_segment(bv, bio_src, iter) { - if (resid bv.bv_len) - break; - resid -= bv.bv_len; - } - voff = resid; - - /* ...and the last affected segment */ - - resid += len; - __bio_for_each_segment(bv, bio_src, end_iter, iter) { - if (resid = bv.bv_len) - break; - resid -= bv.bv_len; - } - vcnt = end_iter.bi_idx = iter.bi_idx + 1; - - /* Build the clone */ - - bio = bio_alloc(gfpmask, (unsigned int) vcnt); + bio = bio_clone(bio_src, gfpmask); if (!bio) return NULL;/* ENOMEM */ - bio-bi_bdev = bio_src-bi_bdev; - bio-bi_iter.bi_sector = bio_src-bi_iter.bi_sector + - (offset SECTOR_SHIFT); - bio-bi_rw = bio_src-bi_rw; - bio-bi_flags |= 1 BIO_CLONED; - - /* -* Copy over our part of the bio_vec, then update the first -* and last (or only) entries. -*/ - memcpy(bio-bi_io_vec[0], bio_src-bi_io_vec[iter.bi_idx], - vcnt * sizeof (struct bio_vec)); - bio-bi_io_vec[0].bv_offset += voff; - if (vcnt 1) { - bio-bi_io_vec[0].bv_len -= voff; - bio-bi_io_vec[vcnt - 1].bv_len = resid; - } else { - bio-bi_io_vec[0].bv_len = len; - } - - bio-bi_vcnt = vcnt; + bio_advance(bio, offset); bio-bi_iter.bi_size = len; return bio; -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 20/25] block: Don't save/copy bvec array anymore
Now that drivers have been converted to the bvec_iter primitives, they shouldn't be modifying the biovec anymore and thus saving it is unnecessary - code that was previously making a backup of the bvec array can now just save bio-bi_iter. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- fs/bio.c | 54 +- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index 6e42b68..9cff939 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -967,60 +967,33 @@ void bio_copy_data(struct bio *dst, struct bio *src) EXPORT_SYMBOL(bio_copy_data); struct bio_map_data { - struct bio_vec *iovecs; - struct sg_iovec *sgvecs; int nr_sgvecs; int is_our_pages; + struct sg_iovec sgvecs[]; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, struct sg_iovec *iov, int iov_count, int is_our_pages) { - memcpy(bmd-iovecs, bio-bi_io_vec, sizeof(struct bio_vec) * bio-bi_vcnt); memcpy(bmd-sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd-nr_sgvecs = iov_count; bmd-is_our_pages = is_our_pages; bio-bi_private = bmd; } -static void bio_free_map_data(struct bio_map_data *bmd) -{ - kfree(bmd-iovecs); - kfree(bmd-sgvecs); - kfree(bmd); -} - static struct bio_map_data *bio_alloc_map_data(int nr_segs, unsigned int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd; - if (iov_count UIO_MAXIOV) return NULL; - bmd = kmalloc(sizeof(*bmd), gfp_mask); - if (!bmd) - return NULL; - - bmd-iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); - if (!bmd-iovecs) { - kfree(bmd); - return NULL; - } - - bmd-sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); - if (bmd-sgvecs) - return bmd; - - kfree(bmd-iovecs); - kfree(bmd); - return NULL; + return kmalloc(sizeof(struct bio_map_data) + + sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int 
iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1030,7 +1003,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec-bv_page); - unsigned int bv_len = iovecs[i].bv_len; + unsigned int bv_len = bvec-bv_len; while (bv_len iov_idx iov_count) { unsigned int bytes; @@ -1090,14 +1063,14 @@ int bio_uncopy_user(struct bio *bio) * don't copy into a random user address space, just free. */ if (current-mm) - ret = __bio_copy_iov(bio, bmd-iovecs, bmd-sgvecs, -bmd-nr_sgvecs, bio_data_dir(bio) == READ, + ret = __bio_copy_iov(bio, bmd-sgvecs, bmd-nr_sgvecs, +bio_data_dir(bio) == READ, 0, bmd-is_our_pages); else if (bmd-is_our_pages) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec-bv_page); } - bio_free_map_data(bmd); + kfree(bmd); bio_put(bio); return ret; } @@ -1211,7 +1184,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, */ if ((!write_to_vm (!map_data || !map_data-null_mapped)) || (map_data map_data-from_user)) { - ret = __bio_copy_iov(bio, bio-bi_io_vec, iov, iov_count, 0, 1, 0); + ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } @@ -1225,7 +1198,7 @@ cleanup: bio_put(bio); out_bmd: - bio_free_map_data(bmd); + kfree(bmd); return ERR_PTR(ret); } @@ -1542,16 +1515,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec-bv_page); - int len = bmd-iovecs[i].bv_len; if (read) - memcpy(p, addr, len); + memcpy(p, addr, bvec-bv_len); __free_page(bvec-bv_page); - p += len; + p += bvec-bv_len; } - bio_free_map_data(bmd); + kfree(bmd); bio_put
[PATCH 24/25] block: Introduce new bio_split()
The new bio_split() can split arbitrary bios - it's not restricted to single page bios, like the old bio_split() (previously renamed to bio_pair_split()). It also has different semantics - it doesn't allocate a struct bio_pair, leaving it up to the caller to handle completions. Then convert the existing bio_pair_split() users to the new bio_split() - and also nvme, which was open coding bio splitting. (We have to take that BUG_ON() out of bio_integrity_trim() because this bio_split() needs to use it, and there's no reason it has to be used on bios marked as cloned; BIO_CLONED doesn't seem to have clearly documented semantics anyways.) Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Martin K. Petersen martin.peter...@oracle.com Cc: Matthew Wilcox matthew.r.wil...@intel.com Cc: Keith Busch keith.bu...@intel.com Cc: Vishal Verma vishal.l.ve...@intel.com Cc: Jiri Kosina jkos...@suse.cz Cc: Neil Brown ne...@suse.de --- drivers/block/nvme-core.c | 106 +++--- drivers/block/pktcdvd.c | 136 drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/io.c | 82 +- drivers/md/bcache/request.c | 12 ++-- drivers/md/linear.c | 96 +++ drivers/md/raid0.c | 77 + drivers/md/raid10.c | 113 +++- fs/bio.c| 36 include/linux/bio.h | 22 +++ 10 files changed, 272 insertions(+), 409 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 5539d29..1f14ac4 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -441,104 +441,19 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, return total_len; } -struct nvme_bio_pair { - struct bio b1, b2, *parent; - struct bio_vec *bv1, *bv2; - int err; - atomic_t cnt; -}; - -static void nvme_bio_pair_endio(struct bio *bio, int err) -{ - struct nvme_bio_pair *bp = bio-bi_private; - - if (err) - bp-err = err; - - if (atomic_dec_and_test(bp-cnt)) { - bio_endio(bp-parent, bp-err); - kfree(bp-bv1); - kfree(bp-bv2); - kfree(bp); - } -} - -static struct 
nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx, - int len, int offset) -{ - struct nvme_bio_pair *bp; - - BUG_ON(len bio-bi_iter.bi_size); - BUG_ON(idx bio-bi_vcnt); - - bp = kmalloc(sizeof(*bp), GFP_ATOMIC); - if (!bp) - return NULL; - bp-err = 0; - - bp-b1 = *bio; - bp-b2 = *bio; - - bp-b1.bi_iter.bi_size = len; - bp-b2.bi_iter.bi_size -= len; - bp-b1.bi_vcnt = idx; - bp-b2.bi_iter.bi_idx = idx; - bp-b2.bi_iter.bi_sector += len 9; - - if (offset) { - bp-bv1 = kmalloc(bio-bi_max_vecs * sizeof(struct bio_vec), - GFP_ATOMIC); - if (!bp-bv1) - goto split_fail_1; - - bp-bv2 = kmalloc(bio-bi_max_vecs * sizeof(struct bio_vec), - GFP_ATOMIC); - if (!bp-bv2) - goto split_fail_2; - - memcpy(bp-bv1, bio-bi_io_vec, - bio-bi_max_vecs * sizeof(struct bio_vec)); - memcpy(bp-bv2, bio-bi_io_vec, - bio-bi_max_vecs * sizeof(struct bio_vec)); - - bp-b1.bi_io_vec = bp-bv1; - bp-b2.bi_io_vec = bp-bv2; - bp-b2.bi_io_vec[idx].bv_offset += offset; - bp-b2.bi_io_vec[idx].bv_len -= offset; - bp-b1.bi_io_vec[idx].bv_len = offset; - bp-b1.bi_vcnt++; - } else - bp-bv1 = bp-bv2 = NULL; - - bp-b1.bi_private = bp; - bp-b2.bi_private = bp; - - bp-b1.bi_end_io = nvme_bio_pair_endio; - bp-b2.bi_end_io = nvme_bio_pair_endio; - - bp-parent = bio; - atomic_set(bp-cnt, 2); - - return bp; - - split_fail_2: - kfree(bp-bv1); - split_fail_1: - kfree(bp); - return NULL; -} - static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int idx, int len, int offset) +int len) { - struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset); - if (!bp) + struct bio *split = bio_split(bio, len 9, GFP_ATOMIC, NULL); + if (!split) return -ENOMEM; + bio_chain(split, bio); + if (bio_list_empty(nvmeq-sq_cong)) add_wait_queue(nvmeq-sq_full, nvmeq-sq_cong_wait); - bio_list_add(nvmeq-sq_cong, bp-b1
[PATCH 19/25] dm: Refactor for new bio cloning/splitting
We need to convert the dm code to the new bvec_iter primitives which respect bi_bvec_done; they also allow us to drastically simplify dm's bio splitting code. Also, it's no longer necessary to save/restore the bvec array anymore - driver conversions for immutable bvecs are done, so drivers should never be modifying it. Also kill bio_sector_offset(), dm was the only user and it doesn't make much sense anymore. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: Alasdair Kergon a...@redhat.com Cc: dm-de...@redhat.com Reviewed-by: Mike Snitzer snit...@redhat.com --- drivers/md/dm-bio-record.h | 25 --- drivers/md/dm.c| 174 ++--- fs/bio.c | 72 --- include/linux/bio.h| 2 - 4 files changed, 20 insertions(+), 253 deletions(-) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 4f46e8e..dd36461 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -17,49 +17,24 @@ * original bio state. */ -struct dm_bio_vec_details { -#if PAGE_SIZE 65536 - __u16 bv_len; - __u16 bv_offset; -#else - unsigned bv_len; - unsigned bv_offset; -#endif -}; - struct dm_bio_details { struct block_device *bi_bdev; unsigned long bi_flags; struct bvec_iter bi_iter; - struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES]; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - unsigned i; - bd-bi_bdev = bio-bi_bdev; bd-bi_flags = bio-bi_flags; bd-bi_iter = bio-bi_iter; - - for (i = 0; i bio-bi_vcnt; i++) { - bd-bi_io_vec[i].bv_len = bio-bi_io_vec[i].bv_len; - bd-bi_io_vec[i].bv_offset = bio-bi_io_vec[i].bv_offset; - } } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - unsigned i; - bio-bi_bdev = bd-bi_bdev; bio-bi_flags = bd-bi_flags; bio-bi_iter = bd-bi_iter; - - for (i = 0; i bio-bi_vcnt; i++) { - bio-bi_io_vec[i].bv_len = bd-bi_io_vec[i].bv_len; - bio-bi_io_vec[i].bv_offset = bd-bi_io_vec[i].bv_offset; - } } #endif diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 
ccd064e..44a2fa6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1155,7 +1155,6 @@ struct clone_info { struct dm_io *io; sector_t sector; sector_t sector_count; - unsigned short idx; }; static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) @@ -1164,68 +1163,24 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) bio-bi_iter.bi_size = to_bytes(len); } -static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) -{ - bio-bi_iter.bi_idx = idx; - bio-bi_vcnt = idx + bv_count; - bio-bi_flags = ~(1 BIO_SEG_VALID); -} - -static void clone_bio_integrity(struct bio *bio, struct bio *clone, - unsigned short idx, unsigned len, unsigned offset, - unsigned trim) -{ - if (!bio_integrity(bio)) - return; - - bio_integrity_clone(clone, bio, GFP_NOIO); - - if (trim) - bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); -} - -/* - * Creates a little bio that just does part of a bvec. - */ -static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned offset, unsigned len) -{ - struct bio *clone = tio-clone; - struct bio_vec *bv = bio-bi_io_vec + idx; - - *clone-bi_io_vec = *bv; - - bio_setup_sector(clone, sector, len); - - clone-bi_bdev = bio-bi_bdev; - clone-bi_rw = bio-bi_rw; - clone-bi_vcnt = 1; - clone-bi_io_vec-bv_offset = offset; - clone-bi_io_vec-bv_len = clone-bi_iter.bi_size; - clone-bi_flags |= 1 BIO_CLONED; - - clone_bio_integrity(bio, clone, idx, len, offset, 1); -} - /* * Creates a bio that consists of range of complete bvecs. 
*/ static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned short bv_count, unsigned len) + sector_t sector, unsigned len) { struct bio *clone = tio-clone; - unsigned trim = 0; - __bio_clone(clone, bio); - bio_setup_sector(clone, sector, len); - bio_setup_bv(clone, idx, bv_count); + __bio_clone_fast(clone, bio); + + if (bio_integrity(bio)) + bio_integrity_clone(clone, bio, GFP_NOIO); + + bio_advance(clone, to_bytes(sector - clone-bi_iter.bi_sector)); + clone-bi_iter.bi_size = to_bytes(len); - if (idx != bio-bi_iter.bi_idx || - clone-bi_iter.bi_size
[PATCH 22/25] block: Generic bio chaining
This adds a generic mechanism for chaining bio completions. This is going to be used for a bio_split() replacement, and it turns out to be very useful in a fair amount of driver code - a fair number of drivers were implementing this in their own roundabout ways, often painfully. Note that this means it's no longer to call bio_endio() more than once on the same bio! This can cause problems for drivers that save/restore bi_end_io. Arguably they shouldn't be saving/restoring bi_end_io at all - in all but the simplest cases they'd be better off just cloning the bio, and immutable biovecs is making bio cloning cheaper. But for now, we add a bio_endio_nodec() for these cases. Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- drivers/md/bcache/io.c | 2 +- drivers/md/dm-cache-target.c | 6 drivers/md/dm-snap.c | 1 + drivers/md/dm-thin.c | 8 +++-- drivers/md/dm-verity.c | 2 +- fs/bio-integrity.c | 2 +- fs/bio.c | 76 include/linux/bio.h | 2 ++ include/linux/blk_types.h| 2 ++ 9 files changed, 90 insertions(+), 11 deletions(-) diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 0f0ab65..522f957 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -133,7 +133,7 @@ static void bch_bio_submit_split_done(struct closure *cl) s-bio-bi_end_io = s-bi_end_io; s-bio-bi_private = s-bi_private; - bio_endio(s-bio, 0); + bio_endio_nodec(s-bio, 0); closure_debug_destroy(s-cl); mempool_free(s, s-p-bio_split_hook); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 86f9c83..bf3a206 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -765,6 +765,12 @@ static void writethrough_endio(struct bio *bio, int err) dm_unhook_bio(pb-hook_info, bio); + /* +* Must bump bi_remaining to allow bio to complete with +* restored bi_end_io. 
+*/ + atomic_inc(bio-bi_remaining); + if (err) { bio_endio(bio, err); return; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3ded8c7..80b5cab 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1415,6 +1415,7 @@ out: if (full_bio) { full_bio-bi_end_io = pe-full_bio_end_io; full_bio-bi_private = pe-full_bio_private; + atomic_inc(full_bio-bi_remaining); } free_pending_exception(pe); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a654024..1abb4a2 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -611,8 +611,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m-bio) + if (m-bio) { m-bio-bi_end_io = m-saved_bi_end_io; + atomic_inc(m-bio-bi_remaining); + } cell_error(m-tc-pool, m-cell); list_del(m-list); mempool_free(m, m-tc-pool-mapping_pool); @@ -626,8 +628,10 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) int r; bio = m-bio; - if (bio) + if (bio) { bio-bi_end_io = m-saved_bi_end_io; + atomic_inc(bio-bi_remaining); + } if (m-err) { cell_error(pool, m-cell); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index ac35e95..796007a 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -385,7 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error) bio-bi_end_io = io-orig_bi_end_io; bio-bi_private = io-orig_bi_private; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static void verity_work(struct work_struct *w) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index fed744b..9d547d2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -502,7 +502,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) /* Restore original bio completion handler */ bio-bi_end_io = bip-bip_end_io; - bio_endio(bio, error); + bio_endio_nodec(bio, error); } /** diff --git a/fs/bio.c b/fs/bio.c index e6dfa06..b0a16db 100644 --- 
a/fs/bio.c +++ b/fs/bio.c @@ -273,6 +273,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio-bi_flags = 1 BIO_UPTODATE; + atomic_set(bio-bi_remaining, 1); atomic_set(bio-bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -295,9 +296,35 @@ void bio_reset(struct bio *bio) memset(bio, 0, BIO_RESET_BYTES); bio-bi_flags = flags|(1 BIO_UPTODATE); + atomic_set(bio-bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); +static void bio_chain_endio(struct bio *bio, int error) +{ + bio_endio(bio
[PATCH 23/25] block: Rename bio_split() -> bio_pair_split()
This is prep work for introducing a more general bio_split(). Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk Cc: NeilBrown ne...@suse.de Cc: Alasdair Kergon a...@redhat.com Cc: Lars Ellenberg lars.ellenb...@linbit.com Cc: Peter Osterlund pete...@telia.com Cc: Sage Weil s...@inktank.com --- drivers/block/pktcdvd.c | 2 +- drivers/md/linear.c | 2 +- drivers/md/raid0.c | 6 +++--- drivers/md/raid10.c | 2 +- fs/bio.c| 4 ++-- include/linux/bio.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ce986ba..28789b8 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2399,7 +2399,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) if (last_zone != zone) { BUG_ON(last_zone != zone + pd-settings.size); first_sectors = last_zone - bio-bi_iter.bi_sector; - bp = bio_split(bio, first_sectors); + bp = bio_pair_split(bio, first_sectors); BUG_ON(!bp); pkt_make_request(q, bp-bio1); pkt_make_request(q, bp-bio2); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fb3b0d0..e9b53e9 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); - bp = bio_split(bio, end_sector - bio-bi_iter.bi_sector); + bp = bio_pair_split(bio, end_sector - bio-bi_iter.bi_sector); linear_make_request(mddev, bp-bio1); linear_make_request(mddev, bp-bio2); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 8ee1a6c..ea754dd 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -534,11 +534,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) * refuse to split for us, so we need to split it. 
*/ if (likely(is_power_of_2(chunk_sects))) - bp = bio_split(bio, chunk_sects - (sector + bp = bio_pair_split(bio, chunk_sects - (sector (chunk_sects-1))); else - bp = bio_split(bio, chunk_sects - - sector_div(sector, chunk_sects)); + bp = bio_pair_split(bio, chunk_sects - + sector_div(sector, chunk_sects)); raid0_make_request(mddev, bp-bio1); raid0_make_request(mddev, bp-bio2); bio_pair_release(bp); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ac4bfa438c..69c1bc8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1193,7 +1193,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. */ - bp = bio_split(bio, chunk_sects - + bp = bio_pair_split(bio, chunk_sects - (bio-bi_iter.bi_sector (chunk_sects - 1))); /* Each of these 'make_request' calls will call 'wait_barrier'. diff --git a/fs/bio.c b/fs/bio.c index b0a16db..a3e753f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1827,7 +1827,7 @@ static void bio_pair_end_2(struct bio *bi, int err) /* * split a bio - only worry about a bio with a single page in its iovec */ -struct bio_pair *bio_split(struct bio *bi, int first_sectors) +struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors) { struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); @@ -1874,7 +1874,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) return bp; } -EXPORT_SYMBOL(bio_split); +EXPORT_SYMBOL(bio_pair_split); /** * bio_trim - trim a bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 64f5169..aa67af0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -317,7 +317,7 @@ struct bio_pair { atomic_tcnt; int error; }; -extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); +extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); extern void bio_trim(struct bio *bio, int offset, int size); 
-- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 17/25] block: Add bio_clone_fast()
bio_clone() just got more expensive - however, most users of bio_clone() don't actually need to modify the biovec. If they aren't modifying the biovec, and they can guarantee that the original bio isn't freed before the clone (also true in most cases), we can just point the clone at the original bio's biovec. Signed-off-by: Kent Overstreet k...@daterainc.com --- drivers/md/bcache/request.c | 8 ++ fs/bio.c| 60 + include/linux/bio.h | 2 ++ 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4c0a422..63451c7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -613,7 +613,6 @@ struct search { struct btree_op op; struct data_insert_op iop; - struct bio_vec bv[BIO_MAX_PAGES]; }; static void bch_cache_read_endio(struct bio *bio, int error) @@ -761,9 +760,7 @@ static void do_bio_hook(struct search *s) struct bio *bio = s-bio.bio; bio_init(bio); - bio-bi_io_vec = s-bv; - bio-bi_max_vecs= BIO_MAX_PAGES; - __bio_clone(bio, s-orig_bio); + __bio_clone_fast(bio, s-orig_bio); bio-bi_end_io = request_endio; bio-bi_private = s-cl; @@ -1065,8 +1062,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s-d); } } else { - s-iop.bio = bio_clone_bioset(bio, GFP_NOIO, - dc-disk.bio_split); + s-iop.bio = bio_clone_fast(bio, GFP_NOIO, dc-disk.bio_split); closure_bio_submit(bio, cl, s-d); } diff --git a/fs/bio.c b/fs/bio.c index 1628917..00dc189 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -549,6 +549,66 @@ void __bio_clone(struct bio *bio, struct bio *bio_src) EXPORT_SYMBOL(__bio_clone); /** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. 
+ */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ + BUG_ON(bio-bi_pool BIO_POOL_IDX(bio) != BIO_POOL_NONE); + + /* +* most users will be overriding -bi_bdev with a new target, +* so we don't set nor calculate new physical/hw segment counts here +*/ + bio-bi_bdev = bio_src-bi_bdev; + bio-bi_flags |= 1 BIO_CLONED; + bio-bi_rw = bio_src-bi_rw; + bio-bi_iter = bio_src-bi_iter; + bio-bi_io_vec = bio_src-bi_io_vec; +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio *b; + + b = bio_alloc_bioset(gfp_mask, 0, bs); + if (!b) + return NULL; + + __bio_clone_fast(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask); + + if (ret 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL(bio_clone_fast); + +/** * bio_clone_bioset - clone a bio * @bio_src: bio to clone * @gfp_mask: allocation priority diff --git a/include/linux/bio.h b/include/linux/bio.h index 1a31f9d..1f83f4a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -328,6 +328,8 @@ extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); +extern void __bio_clone_fast(struct bio *, struct bio *); +extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern void __bio_clone(struct bio *, struct bio *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -- 1.8.4.4 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 25/25] block: Kill bio_pair_split()
Signed-off-by: Kent Overstreet k...@daterainc.com Cc: Jens Axboe ax...@kernel.dk --- fs/bio-integrity.c | 45 --- fs/bio.c| 90 - include/linux/bio.h | 30 -- 3 files changed, 165 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 9d547d2..80d972d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -581,51 +581,6 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset, EXPORT_SYMBOL(bio_integrity_trim); /** - * bio_integrity_split - Split integrity metadata - * @bio: Protected bio - * @bp:Resulting bio_pair - * @sectors: Offset - * - * Description: Splits an integrity page into a bio_pair. - */ -void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) -{ - struct blk_integrity *bi; - struct bio_integrity_payload *bip = bio-bi_integrity; - unsigned int nr_sectors; - - if (bio_integrity(bio) == 0) - return; - - bi = bdev_get_integrity(bio-bi_bdev); - BUG_ON(bi == NULL); - BUG_ON(bip-bip_vcnt != 1); - - nr_sectors = bio_integrity_hw_sectors(bi, sectors); - - bp-bio1.bi_integrity = bp-bip1; - bp-bio2.bi_integrity = bp-bip2; - - bp-iv1 = bip-bip_vec[bip-bip_iter.bi_idx]; - bp-iv2 = bip-bip_vec[bip-bip_iter.bi_idx]; - - bp-bip1.bip_vec = bp-iv1; - bp-bip2.bip_vec = bp-iv2; - - bp-iv1.bv_len = sectors * bi-tuple_size; - bp-iv2.bv_offset += sectors * bi-tuple_size; - bp-iv2.bv_len -= sectors * bi-tuple_size; - - bp-bip1.bip_iter.bi_sector = bio-bi_integrity-bip_iter.bi_sector; - bp-bip2.bip_iter.bi_sector = - bio-bi_integrity-bip_iter.bi_sector + nr_sectors; - - bp-bip1.bip_vcnt = bp-bip2.bip_vcnt = 1; - bp-bip1.bip_iter.bi_idx = bp-bip2.bip_iter.bi_idx = 0; -} -EXPORT_SYMBOL(bio_integrity_split); - -/** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio diff --git a/fs/bio.c b/fs/bio.c index 7b062be..75c49a3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -38,8 +38,6 @@ */ #define BIO_INLINE_VECS4 -static mempool_t *bio_split_pool __read_mostly; - /* * if you 
change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -1829,89 +1827,6 @@ struct bio *bio_split(struct bio *bio, int sectors, } EXPORT_SYMBOL(bio_split); -void bio_pair_release(struct bio_pair *bp) -{ - if (atomic_dec_and_test(bp-cnt)) { - struct bio *master = bp-bio1.bi_private; - - bio_endio(master, bp-error); - mempool_free(bp, bp-bio2.bi_private); - } -} -EXPORT_SYMBOL(bio_pair_release); - -static void bio_pair_end_1(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - - if (err) - bp-error = err; - - bio_pair_release(bp); -} - -static void bio_pair_end_2(struct bio *bi, int err) -{ - struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - - if (err) - bp-error = err; - - bio_pair_release(bp); -} - -/* - * split a bio - only worry about a bio with a single page in its iovec - */ -struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors) -{ - struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); - - if (!bp) - return bp; - - trace_block_split(bdev_get_queue(bi-bi_bdev), bi, - bi-bi_iter.bi_sector + first_sectors); - - BUG_ON(bio_multiple_segments(bi)); - atomic_set(bp-cnt, 3); - bp-error = 0; - bp-bio1 = *bi; - bp-bio2 = *bi; - bp-bio2.bi_iter.bi_sector += first_sectors; - bp-bio2.bi_iter.bi_size -= first_sectors 9; - bp-bio1.bi_iter.bi_size = first_sectors 9; - - if (bi-bi_vcnt != 0) { - bp-bv1 = bio_iovec(bi); - bp-bv2 = bio_iovec(bi); - - if (bio_is_rw(bi)) { - bp-bv2.bv_offset += first_sectors 9; - bp-bv2.bv_len -= first_sectors 9; - bp-bv1.bv_len = first_sectors 9; - } - - bp-bio1.bi_io_vec = bp-bv1; - bp-bio2.bi_io_vec = bp-bv2; - - bp-bio1.bi_max_vecs = 1; - bp-bio2.bi_max_vecs = 1; - } - - bp-bio1.bi_end_io = bio_pair_end_1; - bp-bio2.bi_end_io = bio_pair_end_2; - - bp-bio1.bi_private = bi; - bp-bio2.bi_private = bio_split_pool; - - if (bio_integrity(bi)) - bio_integrity_split(bi, bp, first_sectors); - - return bp; -} 
-EXPORT_SYMBOL(bio_pair_split); - /** * bio_trim - trim a bio * @bio: bio to trim @@ -2113,11 +2028,6 @@ static int __init init_bio(void) if (bioset_integrity_create(fs_bio_set
[PATCH 21/25] block: Remove bi_idx hacks
Now that drivers have been converted to the new bvec_iter primitives, there's
no need to trim the bvec before we submit it; and we can't trim it once we
start sharing bvecs.

It used to be that passing a partially completed bio (i.e. one with nonzero
bi_idx) to generic_make_request() was a dangerous thing - various drivers
would choke on such things. But with immutable biovecs and our new bio
splitting that shares the biovecs, submitting partially completed bios has to
work (and should work, now that all the drivers have been converted to the
new primitives).

Signed-off-by: Kent Overstreet <k...@daterainc.com>
Cc: Jens Axboe <ax...@kernel.dk>
---
 drivers/md/bcache/io.c | 47 ++---------------------------------------------
 fs/bio.c               | 23 -----------------------
 2 files changed, 2 insertions(+), 68 deletions(-)

diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 6e04f3b..0f0ab65 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,49 +11,6 @@
 
 #include <linux/blkdev.h>
 
-static void bch_bi_idx_hack_endio(struct bio *bio, int error)
-{
-	struct bio *p = bio->bi_private;
-
-	bio_endio(p, error);
-	bio_put(bio);
-}
-
-static void bch_generic_make_request_hack(struct bio *bio)
-{
-	if (bio->bi_iter.bi_idx) {
-		struct bio_vec bv;
-		struct bvec_iter iter;
-		unsigned segs = bio_segments(bio);
-		struct bio *clone = bio_alloc(GFP_NOIO, segs);
-
-		bio_for_each_segment(bv, bio, iter)
-			clone->bi_io_vec[clone->bi_vcnt++] = bv;
-
-		clone->bi_iter.bi_sector = bio->bi_iter.bi_sector;
-		clone->bi_bdev		= bio->bi_bdev;
-		clone->bi_rw		= bio->bi_rw;
-		clone->bi_vcnt		= segs;
-		clone->bi_iter.bi_size	= bio->bi_iter.bi_size;
-
-		clone->bi_private	= bio;
-		clone->bi_end_io	= bch_bi_idx_hack_endio;
-
-		bio = clone;
-	}
-
-	/*
-	 * Hack, since drivers that clone bios clone up to bi_max_vecs, but our
-	 * bios might have had more than that (before we split them per device
-	 * limitations).
-	 *
-	 * To be taken out once immutable bvec stuff is in.
-	 */
-	bio->bi_max_vecs = bio->bi_vcnt;
-
-	generic_make_request(bio);
-}
-
 /**
  * bch_bio_split - split a bio
  * @bio:	bio to split
@@ -222,12 +179,12 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 			n->bi_private	= &s->cl;
 			closure_get(&s->cl);
-			bch_generic_make_request_hack(n);
+			generic_make_request(n);
 		} while (n != bio);
 
 		continue_at(&s->cl, bch_bio_submit_split_done, NULL);
 submit:
-		bch_generic_make_request_hack(bio);
+		generic_make_request(bio);
 }
 
 /* Bios with headers */
diff --git a/fs/bio.c b/fs/bio.c
index 9cff939..e6dfa06 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1822,11 +1822,7 @@ void bio_trim(struct bio *bio, int offset, int size)
 {
 	/* 'bio' is a cloned bio which we need to trim to match
 	 * the given offset and size.
-	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
 	 */
-	int i;
-	struct bio_vec *bvec;
-	int sofar = 0;
 
 	size <<= 9;
 	if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1837,25 +1833,6 @@ void bio_trim(struct bio *bio, int offset, int size)
 
 	bio_advance(bio, offset << 9);
 
 	bio->bi_iter.bi_size = size;
-
-	/* avoid any complications with bi_idx being non-zero*/
-	if (bio->bi_iter.bi_idx) {
-		memmove(bio->bi_io_vec, bio->bi_io_vec + bio->bi_iter.bi_idx,
-			(bio->bi_vcnt - bio->bi_iter.bi_idx) *
-			sizeof(struct bio_vec));
-		bio->bi_vcnt -= bio->bi_iter.bi_idx;
-		bio->bi_iter.bi_idx = 0;
-	}
-	/* Make sure vcnt and last bv are not too big */
-	bio_for_each_segment_all(bvec, bio, i) {
-		if (sofar + bvec->bv_len > size)
-			bvec->bv_len = size - sofar;
-		if (bvec->bv_len == 0) {
-			bio->bi_vcnt = i;
-			break;
-		}
-		sofar += bvec->bv_len;
-	}
 }
 EXPORT_SYMBOL_GPL(bio_trim);
-- 
1.8.4.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
[PATCH 14/25] ceph: Convert to immutable biovecs
Now that we've got a mechanism for immutable biovecs -
bi_iter.bi_bvec_done - we need to convert drivers to use primitives that
respect it instead of using the bvec array directly.

Signed-off-by: Kent Overstreet <k...@daterainc.com>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Sage Weil <s...@inktank.com>
Cc: ceph-de...@vger.kernel.org
---
 include/linux/ceph/messenger.h |  4 ++--
 net/ceph/messenger.c           | 43 +++++++++++++++---------------------
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 7c1420b..091fdb6 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -1,6 +1,7 @@
 #ifndef __FS_CEPH_MESSENGER_H
 #define __FS_CEPH_MESSENGER_H
 
+#include <linux/blk_types.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/net.h>
@@ -119,8 +120,7 @@ struct ceph_msg_data_cursor {
 #ifdef CONFIG_BLOCK
 		struct {				/* bio */
 			struct bio	*bio;		/* bio from list */
-			unsigned int	vector_index;	/* vector from bio */
-			unsigned int	vector_offset;	/* bytes from vector */
+			struct bvec_iter	bvec_iter;
 		};
 #endif /* CONFIG_BLOCK */
 		struct {				/* pages */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4a5df7b..18c039b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -777,13 +777,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
 
 	bio = data->bio;
 	BUG_ON(!bio);
-	BUG_ON(!bio->bi_vcnt);
 
 	cursor->resid = min(length, data->bio_length);
 	cursor->bio = bio;
-	cursor->vector_index = 0;
-	cursor->vector_offset = 0;
-	cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+	cursor->bvec_iter = bio->bi_iter;
+	cursor->last_piece =
+		cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
 }
 
 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -792,71 +791,63 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
 {
 	struct ceph_msg_data *data = cursor->data;
 	struct bio *bio;
-	struct bio_vec *bio_vec;
-	unsigned int index;
+	struct bio_vec bio_vec;
 
 	BUG_ON(data->type != CEPH_MSG_DATA_BIO);
 
 	bio = cursor->bio;
 	BUG_ON(!bio);
 
-	index = cursor->vector_index;
-	BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
 
-	bio_vec = &bio->bi_io_vec[index];
-	BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
-	*page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+	*page_offset = (size_t) bio_vec.bv_offset;
 	BUG_ON(*page_offset >= PAGE_SIZE);
 	if (cursor->last_piece) /* pagelist offset is always 0 */
 		*length = cursor->resid;
 	else
-		*length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+		*length = (size_t) bio_vec.bv_len;
 	BUG_ON(*length > cursor->resid);
 	BUG_ON(*page_offset + *length > PAGE_SIZE);
 
-	return bio_vec->bv_page;
+	return bio_vec.bv_page;
 }
 
 static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
 					size_t bytes)
 {
 	struct bio *bio;
-	struct bio_vec *bio_vec;
-	unsigned int index;
+	struct bio_vec bio_vec;
 
 	BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
 
 	bio = cursor->bio;
 	BUG_ON(!bio);
 
-	index = cursor->vector_index;
-	BUG_ON(index >= (unsigned int) bio->bi_vcnt);
-	bio_vec = &bio->bi_io_vec[index];
+	bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
 
 	/* Advance the cursor offset */
 
 	BUG_ON(cursor->resid < bytes);
 	cursor->resid -= bytes;
-	cursor->vector_offset += bytes;
-	if (cursor->vector_offset < bio_vec->bv_len)
+
+	bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+
+	if (bytes < bio_vec.bv_len)
 		return false;	/* more bytes to process in this segment */
-	BUG_ON(cursor->vector_offset != bio_vec->bv_len);
 
 	/* Move on to the next segment, and possibly the next bio */
 
-	if (++index == (unsigned int) bio->bi_vcnt) {
+	if (!cursor->bvec_iter.bi_size) {
 		bio = bio->bi_next;
-		index = 0;
+		cursor->bvec_iter = bio->bi_iter;
 	}
 	cursor->bio = bio;
-	cursor->vector_index = index;
-	cursor->vector_offset = 0;
 
 	if (!cursor->last_piece) {
 		BUG_ON(!cursor->resid);
 		BUG_ON(!bio);
 		/* A short read is OK, so use <= rather than == */
-		if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+		if (cursor->resid <= bio_iter_len(bio, cursor
[PATCH 15/25] block: Kill bio_iovec_idx(), __bio_iovec()
bio_iovec_idx() and __bio_iovec() don't have any valid uses anymore -
previous users have been converted to bio_iovec_iter() or other methods.

__BVEC_END() has to go too - the bvec array can't be used directly for the
last biovec because we might only be using the first portion of it, we
have to iterate over the bvec array with bio_for_each_segment() which
checks against the current value of bi_iter.bi_size.

Signed-off-by: Kent Overstreet <k...@daterainc.com>
Cc: Jens Axboe <ax...@kernel.dk>
---
 block/blk-merge.c   | 13 +++++++++++--
 include/linux/bio.h | 26 ++++++++------------------
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a1ead90..05c17be 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -86,6 +86,9 @@ EXPORT_SYMBOL(blk_recount_segments);
 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 				   struct bio *nxt)
 {
+	struct bio_vec end_bv, nxt_bv;
+	struct bvec_iter iter;
+
 	if (!blk_queue_cluster(q))
 		return 0;
 
@@ -96,14 +99,20 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	if (!bio_has_data(bio))
 		return 1;
 
-	if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+	bio_for_each_segment(end_bv, bio, iter)
+		if (end_bv.bv_len == iter.bi_size)
+			break;
+
+	nxt_bv = bio_iovec(nxt);
+
+	if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
 		return 0;
 
 	/*
 	 * bio and nxt are contiguous in memory; check if the queue allows
 	 * these two to be merged into one
 	 */
-	if (BIO_SEG_BOUNDARY(q, bio, nxt))
+	if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
 		return 1;
 
 	return 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index aea9896..1a31f9d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -61,9 +61,6 @@
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
  */
-#define bio_iovec_idx(bio, idx)	(&((bio)->bi_io_vec[(idx)]))
-#define __bio_iovec(bio)	bio_iovec_idx((bio), (bio)->bi_iter.bi_idx)
-
 #define __bvec_iter_bvec(bvec, iter)	(&(bvec)[(iter).bi_idx])
 
 #define bvec_iter_page(bvec, iter)				\
@@ -162,19 +159,16 @@ static inline void *bio_data(struct bio *bio)
  * permanent PIO fall back, user is probably better off disabling highmem
  * I/O completely on that queue (see ide-dma for example)
  */
-#define __bio_kmap_atomic(bio, idx)				\
-	(kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page) +	\
-	 bio_iovec_idx((bio), (idx))->bv_offset)
+#define __bio_kmap_atomic(bio, iter)				\
+	(kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) +	\
+	 bio_iter_iovec((bio), (iter)).bv_offset)
 
-#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
+#define __bio_kunmap_atomic(addr)	kunmap_atomic(addr)
 
 /*
  * merge helpers etc
  */
 
-#define __BVEC_END(bio)		bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
-#define __BVEC_START(bio)	bio_iovec_idx((bio), (bio)->bi_iter.bi_idx)
-
 /* Default implementation of BIOVEC_PHYS_MERGEABLE */
 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2)	\
 	((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
@@ -191,8 +185,6 @@ static inline void *bio_data(struct bio *bio)
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
 	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
-#define BIO_SEG_BOUNDARY(q, b1, b2) \
-	BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
 
 #define bio_io_error(bio) bio_endio((bio), -EIO)
 
@@ -201,9 +193,7 @@ static inline void *bio_data(struct bio *bio)
  * before it got to the driver and the driver won't own all of it
  */
 #define bio_for_each_segment_all(bvl, bio, i)				\
-	for (i = 0;							\
-	     bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt;	\
-	     i++)
+	for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
 
 static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter *iter,
 				     unsigned bytes)
@@ -468,15 +458,15 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
 }
 #endif
 
-static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
+static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter,
 				   unsigned long *flags)
 {
-	return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
+	return bvec_kmap_irq(&bio_iter_iovec(bio, iter), flags);
 }
 
 #define __bio_kunmap_irq(buf, flags)	bvec_kunmap_irq(buf