[PATCH 19/22] block: Rename bio_split() -> bio_pair_split()

2013-08-07 Thread Kent Overstreet
This is prep work for introducing a more general bio_split().

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: NeilBrown ne...@suse.de
Cc: Alasdair Kergon a...@redhat.com
Cc: Lars Ellenberg lars.ellenb...@linbit.com
Cc: Peter Osterlund pete...@telia.com
Cc: Sage Weil s...@inktank.com
---
 drivers/block/pktcdvd.c | 2 +-
 drivers/md/linear.c | 2 +-
 drivers/md/raid0.c  | 6 +++---
 drivers/md/raid10.c | 2 +-
 fs/bio.c| 4 ++--
 include/linux/bio.h | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)
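(The calling convention is unchanged by the rename - a minimal sketch of
the pattern the callers below all follow, with generic_make_request()
standing in for each driver's own request function:)

	struct bio_pair *bp;

	/* split off the first first_sectors sectors of bio */
	bp = bio_pair_split(bio, first_sectors);	/* was bio_split() */
	if (bp) {
		generic_make_request(&bp->bio1);
		generic_make_request(&bp->bio2);
		/* drop the pair's ref once both halves are submitted */
		bio_pair_release(bp);
	}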

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index f16dfca..c4c8871 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2413,7 +2413,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
if (last_zone != zone) {
		BUG_ON(last_zone != zone + pd->settings.size);
		first_sectors = last_zone - bio->bi_iter.bi_sector;
-   bp = bio_split(bio, first_sectors);
+   bp = bio_pair_split(bio, first_sectors);
BUG_ON(!bp);
		pkt_make_request(q, &bp->bio1);
		pkt_make_request(q, &bp->bio2);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fb3b0d0..e9b53e9 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 
rcu_read_unlock();
 
-	bp = bio_split(bio, end_sector - bio->bi_iter.bi_sector);
+	bp = bio_pair_split(bio, end_sector - bio->bi_iter.bi_sector);
 
	linear_make_request(mddev, &bp->bio1);
	linear_make_request(mddev, &bp->bio2);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8ee1a6c..ea754dd 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -534,11 +534,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 * refuse to split for us, so we need to split it.
 */
if (likely(is_power_of_2(chunk_sects)))
-		bp = bio_split(bio, chunk_sects - (sector &
+		bp = bio_pair_split(bio, chunk_sects - (sector &
   (chunk_sects-1)));
else
-   bp = bio_split(bio, chunk_sects -
-  sector_div(sector, chunk_sects));
+   bp = bio_pair_split(bio, chunk_sects -
+   sector_div(sector, chunk_sects));
	raid0_make_request(mddev, &bp->bio1);
	raid0_make_request(mddev, &bp->bio2);
bio_pair_release(bp);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2303f59..7e49cbe 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1193,7 +1193,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
/* This is a one page bio that upper layers
 * refuse to split for us, so we need to split it.
 */
-   bp = bio_split(bio, chunk_sects -
+   bp = bio_pair_split(bio, chunk_sects -
		       (bio->bi_iter.bi_sector & (chunk_sects - 1)));
 
/* Each of these 'make_request' calls will call 'wait_barrier'.
diff --git a/fs/bio.c b/fs/bio.c
index 7737984..5767f97 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1746,7 +1746,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 /*
  * split a bio - only worry about a bio with a single page in its iovec
  */
-struct bio_pair *bio_split(struct bio *bi, int first_sectors)
+struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors)
 {
struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
 
@@ -1793,7 +1793,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 
return bp;
 }
-EXPORT_SYMBOL(bio_split);
+EXPORT_SYMBOL(bio_pair_split);
 
 /**
  * bio_trim - trim a bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1b06bcb..bc3fa72 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -298,7 +298,7 @@ struct bio_pair {
	atomic_t			cnt;
int error;
 };
-extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
+extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors);
 extern void bio_pair_release(struct bio_pair *dbio);
 extern void bio_trim(struct bio *bio, int offset, int size);
 
-- 
1.8.4.rc1



[PATCH 11/22] block: Kill bio_segments()/bi_vcnt usage

2013-08-07 Thread Kent Overstreet
When we start sharing biovecs, keeping bi_vcnt accurate for splits is
going to be error prone - and unnecessary, if we refactor some code.

So bio_segments() has to go - but most of the existing users just needed
to know if the bio had multiple segments, which is easier - add a
bio_multiple_segments() for them.

(Two of the current uses of bio_segments() are going to go away in a
couple patches, but the current implementation of bio_segments() is
unsafe as soon as we start doing driver conversions for immutable
biovecs - so implement a dumb version for bisectability, it'll go away
in a couple patches)
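
(A sketch of what the two helpers might look like - bio_multiple_segments()
just checks whether the first bvec covers the whole bio, and the dumb,
bisectability-only bio_segments() walks the iterator and counts, ignoring
special cases like discards:)

static inline bool bio_multiple_segments(struct bio *bio)
{
	return bio->bi_iter.bi_size != bio_iovec(bio).bv_len;
}

static inline unsigned bio_segments(struct bio *bio)
{
	unsigned segs = 0;
	struct bio_vec bv;
	struct bvec_iter iter;

	/* dumb and slow, but safe once biovecs are shared */
	bio_for_each_segment(bv, bio, iter)
		segs++;

	return segs;
}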

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Neil Brown ne...@suse.de
Cc: Nagalakshmi Nandigama nagalakshmi.nandig...@lsi.com
Cc: Sreekanth Reddy sreekanth.re...@lsi.com
Cc: James E.J. Bottomley jbottom...@parallels.com
---
 drivers/block/ps3disk.c  |  7 ++---
 drivers/md/bcache/io.c   | 53 ++--
 drivers/md/raid0.c   |  2 +-
 drivers/md/raid10.c  |  2 +-
 drivers/message/fusion/mptsas.c  |  8 ++---
 drivers/scsi/libsas/sas_expander.c   |  8 ++---
 drivers/scsi/mpt2sas/mpt2sas_transport.c | 10 +++---
 drivers/scsi/mpt3sas/mpt3sas_transport.c |  8 ++---
 fs/bio.c |  2 +-
 include/linux/bio.h  | 43 +-
 10 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 464be78..8d1a19c 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -101,10 +101,9 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
 
rq_for_each_segment(bvec, req, iter) {
unsigned long flags;
-		dev_dbg(&dev->sbd.core,
-			"%s:%u: bio %u: %u segs %u sectors from %lu\n",
-			__func__, __LINE__, i, bio_segments(iter.bio),
-			bio_sectors(iter.bio), iter.bio->bi_iter.bi_sector);
+		dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n",
+			__func__, __LINE__, i, bio_sectors(iter.bio),
+			iter.bio->bi_iter.bi_sector);
 
		size = bvec->bv_len;
		buf = bvec_kmap_irq(bvec, &flags);
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 9b5b6a4..6e04f3b 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -24,7 +24,8 @@ static void bch_generic_make_request_hack(struct bio *bio)
	if (bio->bi_iter.bi_idx) {
struct bio_vec bv;
struct bvec_iter iter;
-   struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
+   unsigned segs = bio_segments(bio);
+   struct bio *clone = bio_alloc(GFP_NOIO, segs);
 
bio_for_each_segment(bv, bio, iter)
			clone->bi_io_vec[clone->bi_vcnt++] = bv;
@@ -32,7 +33,7 @@ static void bch_generic_make_request_hack(struct bio *bio)
 		clone->bi_iter.bi_sector = bio->bi_iter.bi_sector;
 		clone->bi_bdev		= bio->bi_bdev;
 		clone->bi_rw		= bio->bi_rw;
-		clone->bi_vcnt		= bio_segments(bio);
+		clone->bi_vcnt		= segs;
 		clone->bi_iter.bi_size	= bio->bi_iter.bi_size;

 		clone->bi_private	= bio;
@@ -133,40 +134,32 @@ out:
 
 static unsigned bch_bio_max_sectors(struct bio *bio)
 {
-   unsigned ret = bio_sectors(bio);
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-   unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
- queue_max_segments(q));
+   struct bio_vec bv;
+   struct bvec_iter iter;
+   unsigned ret = 0, seg = 0;
 
 	if (bio->bi_rw & REQ_DISCARD)
-		return min(ret, q->limits.max_discard_sectors);
-
-	if (bio_segments(bio) > max_segments ||
-	    q->merge_bvec_fn) {
-		struct bio_vec bv;
-		struct bvec_iter iter;
-		unsigned seg = 0;
-
-		ret = 0;
+		return min(bio_sectors(bio), q->limits.max_discard_sectors);
 
-		bio_for_each_segment(bv, bio, iter) {
-			struct bvec_merge_data bvm = {
-				.bi_bdev	= bio->bi_bdev,
-				.bi_sector	= bio->bi_iter.bi_sector,
-				.bi_size	= ret << 9,
-				.bi_rw		= bio->bi_rw,
-			};
-
-			if (seg == max_segments)
-				break;
+	bio_for_each_segment(bv, bio, iter) {
+		struct bvec_merge_data bvm = {
+			.bi_bdev	= bio->bi_bdev,
+			.bi_sector	= bio->bi_iter.bi_sector

Re: [PATCH-v3 1/4] idr: Percpu ida

2013-08-26 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 02:31:57PM -0700, Andrew Morton wrote:
 On Fri, 16 Aug 2013 23:09:06 + Nicholas A. Bellinger 
 n...@linux-iscsi.org wrote:
 
  From: Kent Overstreet k...@daterainc.com
  
  Percpu frontend for allocating ids. With percpu allocation (that works),
  it's impossible to guarantee it will always be possible to allocate all
  nr_tags - typically, some will be stuck on a remote percpu freelist
  where the current job can't get to them.
  
  We do guarantee that it will always be possible to allocate at least
  (nr_tags / 2) tags - this is done by keeping track of which and how many
  cpus have tags on their percpu freelists. On allocation failure if
  enough cpus have tags that there could potentially be (nr_tags / 2) tags
  stuck on remote percpu freelists, we then pick a remote cpu at random to
  steal from.
  
  Note that there's no cpu hotplug notifier - we don't care, because
  steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
  more allocations if we had a notifier - but we'll still meet our
  guarantees and it's absolutely not a correctness issue, so I don't think
  it's worth the extra code.
 
  ...
 
   include/linux/idr.h |   53 +
  lib/idr.c   |  316 +--
 
 I don't think this should be in idr.[ch] at all.  It has no
 relationship with the existing code.  Apart from duplicating its
 functionality :(

Well, in the full patch series it does make use of the non-percpu ida.
I'm still hoping to get the ida/idr rewrites in.
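
(A sketch of the (nr_tags / 2) logic from the quoted description - not the
patch's exact code; cpus_have_tags, percpu_max_size and nr_tags stand for
the bookkeeping fields it describes:)

static bool worth_stealing(struct percpu_ida *pool)
{
	/* cpus that still hold tags on their percpu freelists */
	unsigned cpus = cpumask_weight(&pool->cpus_have_tags);

	/*
	 * Each cpu caches at most percpu_max_size tags, which bounds how
	 * many tags can be stranded remotely; steal only when that bound
	 * reaches half the tag space, preserving the nr_tags / 2 guarantee.
	 */
	return cpus * pool->percpu_max_size > pool->nr_tags / 2;
}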


Re: [PATCH-v3 1/4] idr: Percpu ida

2013-08-26 Thread Kent Overstreet
On Wed, Aug 21, 2013 at 06:25:58PM +, Christoph Lameter wrote:
 On Fri, 16 Aug 2013, Nicholas A. Bellinger wrote:
 
  +   spinlock_t  lock;
 
 Remove the spinlock.

As Andrew noted, the spinlock is needed because of tag stealing. (You
don't think I'd stick a spinlock on a percpu data structure without a
real reason, would you?)

+	unsigned		nr_free;
+	unsigned		freelist[];
  +};
  +
  +static inline void move_tags(unsigned *dst, unsigned *dst_nr,
  +unsigned *src, unsigned *src_nr,
  +unsigned nr)
  +{
  +   *src_nr -= nr;
  +   memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
  +   *dst_nr += nr;
  +}
  +
 
  +static inline unsigned alloc_local_tag(struct percpu_ida *pool,
  +  struct percpu_ida_cpu *tags)
 
 Pass the __percpu offset and not the tags pointer.

Why? It just changes where the this_cpu_ptr() call happens.

 
  +{
  +   int tag = -ENOSPC;
  +
+	spin_lock(&tags->lock);
 
 Interupts are already disabled. Drop the spinlock.
 
+	if (tags->nr_free)
+		tag = tags->freelist[--tags->nr_free];
 
 You can keep this or avoid address calculation through segment prefixes.
 F.e.
 
 if (__this_cpu_read(tags->nr_free)) {
 	int n = __this_cpu_dec_return(tags->nr_free);
 	tag = __this_cpu_read(tags->freelist[n]);
 }

Can you explain what the point of that change would be? It sounds like
it's preferable to do it that way and avoid this_cpu_ptr() for some
reason, but you're not explaining why.
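
(For reference, the two styles being compared look roughly like this - a
sketch assuming a struct percpu_ida_cpu __percpu *tag_cpu member in the
pool:)

	/* style 1: materialize a pointer to this cpu's struct, then deref */
	struct percpu_ida_cpu *tags = this_cpu_ptr(pool->tag_cpu);

	if (tags->nr_free)
		tag = tags->freelist[--tags->nr_free];

	/* style 2: segment-prefixed this_cpu ops, no explicit pointer */
	if (__this_cpu_read(pool->tag_cpu->nr_free)) {
		int n = __this_cpu_dec_return(pool->tag_cpu->nr_free);

		tag = __this_cpu_read(pool->tag_cpu->freelist[n]);
	}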


[GIT PULL] Bcache fix for 3.11

2013-08-26 Thread Kent Overstreet
Last writeback fix had a brown paper bag bug in it. The fix for that is
a one liner though:

The following changes since commit 9df79a37eeb7ee89dc56aeed3051be676369359a:

  bcache: Fix a flush/fua performance bug (2013-08-19 15:30:45 -0700)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11

for you to fetch changes up to 174c1412939888ffedc4da02b375bc579af2a93c:

  bcache: Fix a dumb CPU spinning bug in writeback (2013-08-23 16:30:22 -0700)


Kent Overstreet (1):
  bcache: Fix a dumb CPU spinning bug in writeback

 drivers/md/bcache/writeback.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)


Bcache fixes for stable

2013-08-26 Thread Kent Overstreet
Two fixes that went into 3.11 should've been marked for stable, but I
screwed up and forgot to mark them.

One of them fixes bugs that appear to being hit in the wild, so please
pull these:

The following changes since commit 0a4b6d4ff200a553951f77f765971cb3e4c91ec0:

  Linux 3.10.9 (2013-08-20 15:40:47 -0700)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.10

for you to fetch changes up to a213846fafcdf52153dfebf8b1671498a72b6285:

  bcache: FUA fixes (2013-08-26 14:56:52 -0700)


Kent Overstreet (1):
  bcache: FUA fixes

Kumar Amit Mehta (1):
  md: bcache: io.c: fix a potential NULL pointer dereference

 drivers/md/bcache/btree.c   | 23 +--
 drivers/md/bcache/io.c  |  2 ++
 drivers/md/bcache/journal.c |  2 +-
 drivers/md/bcache/request.c | 13 -
 4 files changed, 36 insertions(+), 4 deletions(-)


Re: Bcache fixes for stable

2013-08-26 Thread Kent Overstreet
On Mon, Aug 26, 2013 at 03:19:57PM -0700, Greg Kroah-Hartman wrote:
 On Mon, Aug 26, 2013 at 03:02:49PM -0700, Kent Overstreet wrote:
  Two fixes that went into 3.11 should've been marked for stable, but I
  screwed up and forgot to mark them.
  
  One of them fixes bugs that appear to being hit in the wild, so please
  pull these:
  
  The following changes since commit 0a4b6d4ff200a553951f77f765971cb3e4c91ec0:
  
Linux 3.10.9 (2013-08-20 15:40:47 -0700)
  
  are available in the git repository at:
  
git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.10
  
  for you to fetch changes up to a213846fafcdf52153dfebf8b1671498a72b6285:
  
bcache: FUA fixes (2013-08-26 14:56:52 -0700)
  
  
  Kent Overstreet (1):
bcache: FUA fixes
  
  Kumar Amit Mehta (1):
md: bcache: io.c: fix a potential NULL pointer dereference
 
 Can I get a hint as to what the git commit ids of these patches are in
 Linus's tree?

5c694129c8db6d89c9be109049a16510b2f70f6d
e49c7c374e7aacd1f04ecbc21d9dbbeeea4a77d6

 Ideally, that's all you need to send me, but if you want to send me a
 git pull, that's fine too, but please use a signed tag, otherwise it's
 just a random tree...

*nod* I need to get some more signatures on my gpg key...

 Also, if you do a git pull, I need the git commit ids in the changelog
 entry that match Linus's tree, so people (like me) can verify things.

Ok, I'll remember that for the future.

 For now, I can edit these two patches and add them in, if you give me
 the git commit ids to verify them.

Ok - I don't care too much either way but there were some slightly
nontrivial conflicts on the second patch, and the one I did backported
has been tested.


Re: bcache: Fix a writeback performance regression

2013-08-28 Thread Kent Overstreet
2013-08-26 21:05:27 INFO: task ceph-osd:8798 blocked for more than 120 seconds.
 
  Stefan
 
  On 22.08.2013 09:32, Stefan Priebe - Profihost AG wrote:
 
  great!
 
  Everything seems to work fine now! Except read_dirty always going to
  negative values after a reboot.
 
  Stefan
 
  On 22.08.2013 08:02, Kent Overstreet wrote:
 
  On Thu, Aug 22, 2013 at 07:59:04AM +0200, Stefan Priebe wrote:
 
 
  schedule_timeout() is not the same as
  schedule_timeout_interruptible().
 
 
  just search and replace? So i can try on my own.
 
 
  The one in read_dirty(), line ~330


Re: [PATCH-v4 1/6] idr: Percpu ida

2013-08-28 Thread Kent Overstreet
On Tue, Aug 27, 2013 at 01:43:14PM -0700, Andrew Morton wrote:
 On Tue, 27 Aug 2013 20:11:41 + Nicholas A. Bellinger 
 n...@linux-iscsi.org wrote:
 
  Percpu frontend for allocating ids. With percpu allocation (that works),
  it's impossible to guarantee it will always be possible to allocate all
  nr_tags - typically, some will be stuck on a remote percpu freelist
  where the current job can't get to them.
  
  We do guarantee that it will always be possible to allocate at least
  (nr_tags / 2) tags - this is done by keeping track of which and how many
  cpus have tags on their percpu freelists. On allocation failure if
  enough cpus have tags that there could potentially be (nr_tags / 2) tags
  stuck on remote percpu freelists, we then pick a remote cpu at random to
  steal from.
  
  Note that there's no cpu hotplug notifier - we don't care, because
  steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
  more allocations if we had a notifier - but we'll still meet our
  guarantees and it's absolutely not a correctness issue, so I don't think
  it's worth the extra code.
 
 Except for one silly trivial thing, all of my August 20 review comments
 were ignored.  You didn't even bother replying to the email.

Sorry! I remember seeing that email now, but somehow I completely missed
all the review comments after the first one. Mea culpa!


Re: bcache: Fix a writeback performance regression

2013-08-19 Thread Kent Overstreet
On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote:
 
 Vanilla 3.10.7 + bcache: Fix a writeback performance regression
 
 http://pastebin.com/raw.php?i=LXZk4cMH

Whoops, at first I thought this was the same bug as one I'd already been
chasing down that had been a harmless bug - turns out I didn't look
closely enough at the backtrace.

What happened is background writeback is deadlocking, because for some
reason the workqueue it's running out of is a singlethreaded workqueue,
so as soon as it decides to queue enough writeback bios that it has to
sleep on that semaphore (which often won't happen due to the PD
controller based ratelimiting) - boom, deadlock.

Here's the fixup patch I just tested and am applying:


From 0af68de350e05e43fd093b36dcb0fe8aa838fabf Mon Sep 17 00:00:00 2001
From: Kent Overstreet k...@daterainc.com
Date: Mon, 19 Aug 2013 15:26:22 -0700
Subject: [PATCH] bcache: Fix a writeback deadlock

Signed-off-by: Kent Overstreet k...@daterainc.com

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f88c62e..27ac519 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -351,7 +351,7 @@ static void write_dirty(struct closure *cl)
 
	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-   continue_at(cl, write_dirty_finish, dirty_wq);
+   continue_at(cl, write_dirty_finish, system_wq);
 }
 
 static void read_dirty_endio(struct bio *bio, int error)
@@ -371,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
 
	closure_bio_submit(&io->bio, cl, &io->dc->disk);
 
-   continue_at(cl, write_dirty, dirty_wq);
+   continue_at(cl, write_dirty, system_wq);
 }
 
 static void read_dirty(struct closure *cl)
@@ -512,7 +512,7 @@ void bch_writeback_exit(void)
 
 int __init bch_writeback_init(void)
 {
-	dirty_wq = create_singlethread_workqueue("bcache_writeback");
+	dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq)
return -ENOMEM;
 


Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida

2013-08-20 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 02:29:56PM -0700, Andrew Morton wrote:
 On Tue, 20 Aug 2013 14:19:06 -0700 Nicholas A. Bellinger 
 n...@linux-iscsi.org wrote:
 
  On Thu, 2013-08-08 at 14:32 +, Christoph Lameter wrote:
   On Wed, 7 Aug 2013, Kent Overstreet wrote:
   
One thing that was bugging me - I was never able to figure out for sure
if smp_processor_id() returns a number in the range [0, nr_cpu_ids), at
least I couldn't find where it was documented - could you tell me if
that's true?
   
   I always assumed that it was in the range 0 ... nr_cpu_ids - 1 and that is
   the assumption under which the kernel code was written. Things would break
   horribly if smp_processor_id() would return nr_cpu_ids or higher.
   
  
  Hi guys,
  
  Just a heads up that I've put Kent's standalone percpu-ida patch (with
  Christoph's recommend changes) into target-pending/for-next here:
  
  https://git.kernel.org/cgit/linux/kernel/git/nab/target-pending.git/commit/?h=for-nextid=47bd524a5b3eb6429b058b8b562b45329ab2c9e7
  
  I've got a number of target patches that depend on this code for v3.12,
  and a delay on this particular piece would be painful to endure..
  
  Sooo, please yell loudly if there is an objection to percpu-ida merge as
  a completely standalone item, that does not effect any existing ida
  code.
 
 Was hoping that Tejun had time.  I'll take a look...

I think Tejun and I might be at a bit of an impasse with the ida rewrite
itself, but I don't think there were any outstanding objections to the
percpu ida code itself - and this is a standalone version.

I was meaning to ask you Andrew, if you could take a look at the ida
discussion and lend your opinion - I don't think there are any _specific_
technical objections left to my ida code, and it's now on a more
philosophical complexity vs. ... level.


Re: [PATCH] bcache: Remove use of down/up_read_non_owner()

2013-08-20 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 11:16:02AM -0400, Steven Rostedt wrote:
 
 The down/up_read_non_owner() is a nasty hack in the API of the rwsem
 operations. It was once removed, but then resurrected for use with
 bcache. Not only is the API an abomination to the rwsem API, it also
 prevents bcache from ever being compiled with PREEMPT_RT, as the RT
 kernel requires all rwsems to have an owner.
 
 Instead of keeping the down/up_read_non_owner() around, it is better to
 modify the one user of it and have it do something a bit differently.
 
 From reading the bcache code, the reason for the non_owner usage is
 that a request is made, and writers must wait for that request to
 finish before they can continue. But because the request is completed
 by another task, the rwsem can not be held for read and then released
 on completion.
 
 Instead of abusing the rwsem code for this, I added a refcount
 nr_requests to the cached_dev structure as well as a write_waiters
 wait queue. When a request is to be made, the rwsem is still taken for
 read, but this time with an owner. The refcount is incremented and
 before exiting the function, the rwsem is released.
 
 The writer will then take the rwsem for write, check the refcount, if
 it is not zero, it will release the rwsem, add itself to a wait_event()
 waiting for refcount to become zero, and then try again.

I _really_ disagree with this approach.

I get that there's a problem, but the bcache code REALLY IS USING THE
RWSEM AS A LOCK; the answer isn't to open code the lock!

Apologies to Christoph for getting distracted and not responding when
you started to explain what the issues were for RT. I'm not really
convinced they're that insurmountable (one of the issues was debugging,
which the _non_owner() stuff always handled just fine), but I'll take it
on faith that this usage is incompatible with rwsems + the RT
functionality since I haven't actually had time to dig into it.

So assuming that's the case, IMO the sanest thing to do is make a new
type of lock - rwsem_non_process or somesuch - and use _that_ in
bcache. Not open coding the lock.

It can even live in the bcache code if we want since there currently
wouldn't be any other users, I don't really care. But open coding it?
Come on... makes me wonder what other code in the kernel is open coding
locks because it couldn't release it in the same process context that
took the lock for whatever reason.

Also, nack this patch because it increases the number of atomic ops to
shared cachelines in our fast path. If it does end up being open coded,
I'll make a more efficient version.


Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida

2013-08-20 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 10:07:42PM -0400, Tejun Heo wrote:
 Hello, Kent.
 
 On Tue, Aug 20, 2013 at 07:01:32PM -0700, Kent Overstreet wrote:
  I think Tejun and I might be at a bit of an impasse with the ida rewrite
  itself, but I don't think there were any outstanding objections to the
  percpu ida code itself - and this is a standalone version.
 
 The percpu ida code can be applied separately from the ida rewrite?

Yes - at the cost of using significantly more memory for the global
freelist.

  I was meaning to ask you Andrew, if you could take a look at the ida
  discussion and lend your opinion - I don't think there's any _specific_
  technical objections left to my ida code, and it's now on a more
  philisophical complexity vs. ... level.
 
 Hmmm... the objection was pretty specific - don't depend on high-order
 or vmalloc allocations when it can be easily avoided by using proper
 radix tree.

We only do vmalloc allocations for CONFIG_COMPACTION=n, and then only
when we need to allocate more than almost 1 _billion_ ids from a single
ida (twice that on 32 bit, so never because that gets us just about to
INT_MAX) - and then it's only 32k of vmalloc memory, for the entire ida.

This is with max allocation order of 4 for COMPACTION=y, 2 for
COMPACTION=n. 

All this for a performance improvement of 10x to 50x (or more), for the
ida sizes I measured.

So I could see your point if we were allocating gobs of vmalloc memory,
or high order allocations big enough to realistically be problematic (I
honestly don't think these will be) - but to me, this seems like a
pretty reasonable tradeoff for those performance gains.

(And the performance gains do substantially come from using more
contiguous memory and treating the whole data structure as an array, and
doing less pointer chasing/looping)
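
(The "treating the whole data structure as an array" point, sketched: an
implicit tree indexed arithmetically, like a binary heap, so no child
pointers are stored or chased - the indexing scheme here is illustrative,
not necessarily the patch's:)

	/* node i's children live at a computed offset; no pointers stored */
	static inline unsigned first_child(unsigned i)
	{
		return i * IDA_TREE_ARY + 1;
	}

	static inline unsigned parent(unsigned i)
	{
		return (i - 1) / IDA_TREE_ARY;
	}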


Re: [PATCH] bcache: Remove use of down/up_read_non_owner()

2013-08-20 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 10:41:53PM -0400, Steven Rostedt wrote:
  I get that there's a problem, but the bcache code REALLY IS USING THE
  RWSEM AS A LOCK; the answer isn't to open code the lock!
 
 Actually, it is using it as a semaphore. The problem with Linux
 was that it only had spin locks and semaphores. People would use the
 semaphore as either a completion or a mutex. Luckily, we created both
 of those and replaced 90% of all semaphores with either a mutex or a
 completion.
 
 The rwsem, sorta has the same issue. But it was pretty much always used
 as a read/write sleeping lock. It was not used as a semaphore. But it
 seems that the bcache really does use it as a semaphore here as it
 isn't just a mutex location. It's a special kind of completion, but the
 completion is in a strange way.

I think we're mostly quibbling over semantics here, and it's not that
big a deal - that said, I don't really get your distinction between a
semaphore and a mutex; I'd distinguish them by saying a semaphore can
count an arbitrary number of resources (the number it's initialized to),
and a mutex is always initialized to one - the fact that in the kernel
mutexes must be released by the same process that took them (while
important for PI and debugging) is not fundamental.

rw semaphore never made any sense to me; they're not counting
anything, like regular semaphores. They're just sleepable rw locks.
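
(The distinction, in code - a counting semaphore vs. a mutex:)

	struct semaphore sem;
	sema_init(&sem, 4);	/* counts four resources; any task may up() */

	DEFINE_MUTEX(m);	/* initialized to one; released by its taker */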

So it _sounds_ like you're saying bcache is using it as a semaphore,
beacuse it's using it as a lock we don't release in the original
context? in analogy to linux mutexes and semaphores? Not quite sure what
you mean.

  Apologies to Christoph for getting distracted and not responding when
  you started to explain what the issues were for RT. I'm not really
  convinced they're that insurmountable (one of the issues was debugging,
  which the _non_owner() stuff always handled just fine), but I'll take it
  on faith that this usage is incompatible with rwsems + the RT
  functionality since I haven't actually had time to dig into it.
 
 The thing is, RT requires priority inheritance. When a task blocks on a
 rwsem, it has to boost the owner of that rwsem otherwise it risks
 unbounded priority inversion.
 
 I have a hack that actually implements a work around for non_owner, but
 it adds overhead to all locks to do so.

Ok - I'd just be curious why the PI bit can't be factored out into a
wrapper like how the debug stuff is handled with the _non_owner bits,
but I can certainly believe that.

  So assuming that's the case, IMO the sanest thing to do is make a new
  type of lock - rwsem_non_process or somesuch - and use _that_ in
  bcache. Not open coding the lock.
 
 I actually started that, but decided this was the easier patch to send.
 Don't worry, I was expecting this email. No surprises here ;-)
 
 This email was taken from Andrew Morton's play book (I see you Cc'd
 him, I forgot to). It's one of these patches of It's so bad that the
 owner of the code will fix the issue out of fear that this patch may
 make it in.

Ah, I see :)

  It can even live in the bcache code if we want since there currently
  wouldn't be any other users, I don't really care. But open coding it?
  Come on... makes me wonder what other code in the kernel is open coding
  locks because it couldn't release it in the same process context that
  took the lock for whatever reason.
 
 Most cases just have completions. This looks like a case where don't
 do something while transaction is taking place. Which can usually get
 away with a wait event.

Eh, only in sense that it's easy to implement mutexes and rwlocks with
wait_event().

To simplify the algorithm just a bit (only leaving out some
optimizations), bcache is using it to protect an rb tree, which contains
things that are undergoing background writeback.

For foreground writes, we've got to check if the write overlaps with
anything in this rb tree. If so, we force _this_ write to writeback;
background writeback will write stale data to the backing device, but
that's fine because the data in the cache will still be marked as dirty.

To add stuff to this rb tree - i.e. for background writeback to start
flushing some dirty data - it's got to make sure it's not going to
overwrite some in progress foreground writethrough write.

Tracking every outstanding foreground write would be expensive, so
foreground writes take a read lock on the rb tree (both to check it for
anything they overlap with, and for their duration), and background
writeback takes a write lock when it goes to scan for dirty data to add
to the rb tree.

Even if you complain it's not _just_ protecting the rb tree, we're still
using it for mutual exclusion, plain and simple, and that's all a lock
is.
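
(A sketch of that pattern - dc->writeback_lock is the rwsem in question,
the helper names here are made up for illustration:)

static bool foreground_write_overlaps_dirty(struct cached_dev *dc,
					    struct bio *bio)
{
	bool overlap;

	down_read(&dc->writeback_lock);			/* foreground write */
	overlap = dirty_tree_check_overlap(dc, bio);	/* hypothetical */
	up_read(&dc->writeback_lock);

	return overlap;
}

static void writeback_scan_dirty(struct cached_dev *dc)
{
	down_write(&dc->writeback_lock);	/* background writeback */
	/* scan for dirty data, add it to the rb tree... */
	up_write(&dc->writeback_lock);
}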


  Also, nack this patch because increasing the number of atomic ops to
  shared cachelines in our fast path. If it does end up being open coded,
  I'll make a more efficient version.
 
 Perhaps I can whip up a struct rwsem_non_owner and have a 

Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida

2013-08-21 Thread Kent Overstreet
On Wed, Aug 21, 2013 at 07:59:41AM -0400, Tejun Heo wrote:
 Hello, Kent.
 
 On Tue, Aug 20, 2013 at 07:31:51PM -0700, Kent Overstreet wrote:
  All this for a performance improvement of 10x to 50x (or more), for the
  ida sizes I measured.
 
 That's misleading, isn't it? 

It's comparing it to the existing version that actually exists, instead
of comparing it to a hypothetical approach that doesn't exist yet. I
don't see how that's misleading.

 We should see large performance
 improvements even without the large pages.  What matters more is the
 leaf node performance for vast majority of cases and an extra radix
 tree layer on top would cover most of whatever is left.  Whether to
 use high order pages or not only affects the extreme niche use cases
 and I don't think going for high order pages to micro optimize those
 extreme use cases is the right trade off.
 
  So I could see your point if we were allocating gobs of vmalloc memory,
  or high order allocations big enough to realistically be problematic (I
  honestly don't think these will be) - but to me, this seems like a
  pretty reasonable tradeoff for those performance gains.
 
 The trade off is made up as the bulk of the performance benefit can be
 gained without resorting to high order allocations.

I'm more and more skeptical that that's true, and it's a given that it
wouldn't be as fast.

  (And the performance gains do substantially come from using more
  contiguous memory and treating the whole data structure as an array, and
  doing less pointer chasing/looping)
 
 I really have hard time buying that.  Let's say you go with single
 page leaf node and an extra single page layer on top.  How many IDs
 are we talking about?  For the cases which are most performance
 sensitive, this doesn't even matter a bit as percpu caching layer
 would be on top anyway.  I really don't think the micro optimization
 is called for at the cost of high order allocations from low level
 tool library.

These micro optimizations mean either less pointer chasing or less
branching in the _common_ case; you'd trade common case performance for
avoiding ever doing higher order allocations (and 2 with COMPACTION=n
and 4 with COMPACTION=y is not particularly high order!).

I don't buy that that's a good tradeoff. If you're convinced radix trees
are the way to go and it can be done without much performance cost, why
not code it up and show us?


Re: bcache: Fix a writeback performance regression

2013-08-21 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote:
 On 20.08.2013 10:01, Stefan Priebe - Profihost AG wrote:
  On 20.08.2013 00:27, Kent Overstreet wrote:
  On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote:
 
  Vanilla 3.10.7 + bcache: Fix a writeback performance regression
 
  http://pastebin.com/raw.php?i=LXZk4cMH
 
  Whoops, at first I thought this was the same bug as one I'd already been
  chasing down that had been a harmless bug - turns out I didn't look
  closely enough at the backtrace.
 
  What happened is background writeback is deadlocking, because for some
  reason the workqueue it's running out of is a singlethreaded workqueue,
  so as soon as it decides to queue enough writeback bios that it has to
  sleep on that semaphore (which often won't happen due to the PD
  controller based ratelimiting) - boom, deadlock.
 
  Here's the fixup patch I just tested and am applying:
 
 Oh i'm now seeing very high CPU spikes of kworker... i don't see if i
 remove bcache: Fix a writeback performance regression.

*swears*

I just saw that this morning, but I assumed it was a bug in my stripe
aware scan code that isn't in this branch yet - thanks for letting me
know that's not the case.

I shall work on a fix for the fix...


Re: [PATCH] idr: Use this_cpu_ptr() for percpu_ida

2013-08-21 Thread Kent Overstreet
On Wed, Aug 21, 2013 at 05:16:50PM -0400, Tejun Heo wrote:
 Hello, Kent.
 
 On Wed, Aug 21, 2013 at 02:09:01PM -0700, Kent Overstreet wrote:
  These micro optimizations mean either less pointer chasing or less
  branching in the _common_ case; you'd trade common case performance for
  avoiding ever doing higher order allocations (and 2 with COMPACTION=n
  and 4 with COMPACTION=y is not particularly high order!).
 
 Order 4 allocation probably isn't as bad as before but it still is a
 lot nastier than single page allocations.  You say doing it the other
 way would harm the common case performance but didn't answer my
 question about the number of IDs being served per page.  How many can
 be served from a single page?  And how many from two layer single page
 configuration?  How are you defining the common case?

With single page allocations:

1 << 15 bits per page

1 << 9 pointers per page

So two layers of pointers does get us to 1 << 33 bits, which is what we
need.
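
(The arithmetic, spelled out for 4k pages and 8 byte pointers:)

	PAGE_SIZE * 8			= 1 << 15 bits per leaf page
	PAGE_SIZE / sizeof(void *)	= 1 << 9 pointers per interior page

	/* two pointer layers over the leaf layer: */
	(1 << 9) * (1 << 9) * (1 << 15) = 1 << 33 ids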

But now, since we need two layers of pointers instead of one, we need
either another pointer deref for a node lookup - _always_, even when
we've got 8 bytes of bits - or we need to branch on the depth of the
tree, which is something we don't have now.

This is extra overhead _no matter the size of the ida_, over my current
approach.

I'm assuming the common case is < one page of bits, based on the usage
I've seen throughout the kernel that's probably way conservative.

In that case, your approach is going to be slower than mine, and there's
no difference in the size of the allocations.

  I don't buy that that's a good tradeoff. If you're convinced radix trees
  are the way to go and it can be done without much performance cost, why
  not code it up and show us?
 
 Well, I'm not the one trying to rewrite ida, so the onus to justify
 the proposed code is primarily on you.  Another thing is that the
 proposed code is *not* using the existing radix tree and instead
 implementing its own simplified radix tree, which *can* be fine but
 the bar to clear is fairly high.  You have to be able to show
 *clearly* that using the existing radix tree is not an option.  Until
 now, the only thing that I gathered is the simplified thing is gonna
 be faster in some extreme cases while having clear disadvantage in
 terms of memory allocation.  Not very convincing.

I've already shown massive performance gains over the existing radix
tree approach, you're the one claiming a different approach would be
better.


Re: bcache: Fix a writeback performance regression

2013-08-21 Thread Kent Overstreet
On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote:
 On 20.08.2013 10:01, Stefan Priebe - Profihost AG wrote:
  On 20.08.2013 00:27, Kent Overstreet wrote:
  On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote:
 
  Vanilla 3.10.7 + bcache: Fix a writeback performance regression
 
  http://pastebin.com/raw.php?i=LXZk4cMH
 
  Whoops, at first I thought this was the same bug as one I'd already been
  chasing down that had been a harmless bug - turns out I didn't look
  closely enough at the backtrace.
 
  What happened is background writeback is deadlocking, because for some
  reason the workqueue it's running out of is a singlethreaded workqueue,
  so as soon as it decides to queue enough writeback bios that it has to
  sleep on that semaphore (which often won't happen due to the PD
  controller based ratelimiting) - boom, deadlock.
 
  Here's the fixup patch I just tested and am applying:
 
 Oh i'm now seeing very high CPU spikes of kworker... i don't see if i
 remove bcache: Fix a writeback performance regression.

Are you able to reproduce it? I'm not having any luck reproducing it...


Re: bcache: Fix a writeback performance regression

2013-08-21 Thread Kent Overstreet
 [a010c49e] request_write+0x19e/0x380 [bcache]
 
 On 22.08.2013 01:47, Kent Overstreet wrote:
 On Tue, Aug 20, 2013 at 10:07:45AM +0200, Stefan Priebe - Profihost AG wrote:
 On 20.08.2013 10:01, Stefan Priebe - Profihost AG wrote:
 On 20.08.2013 00:27, Kent Overstreet wrote:
 On Mon, Aug 19, 2013 at 12:09:24AM +0200, Stefan Priebe wrote:
 
 Vanilla 3.10.7 + bcache: Fix a writeback performance regression
 
 http://pastebin.com/raw.php?i=LXZk4cMH
 
 Whoops, at first I thought this was the same bug as one I'd already been
 chasing down that had been a harmless bug - turns out I didn't look
 closely enough at the backtrace.
 
 What happened is background writeback is deadlocking, because for some
 reason the workqueue it's running out of is a singlethreaded workqueue,
 so as soon as it decides to queue enough writeback bios that it has to
 sleep on that semaphore (which often won't happen due to the PD
 controller based ratelimiting) - boom, deadlock.
 
 Here's the fixup patch I just tested and am applying:
 
 Oh i'm now seeing very high CPU spikes of kworker... i don't see if i
 remove bcache: Fix a writeback performance regression.
 
 Are you able to reproduce it? I'm not having any luck reproducing it...


Re: bcache: Fix a writeback performance regression

2013-08-22 Thread Kent Overstreet
On Thu, Aug 22, 2013 at 07:59:04AM +0200, Stefan Priebe wrote:
 
 schedule_timeout() is not the same as
 schedule_timeout_interruptible().
 
 just search and replace? So i can try on my own.

The one in read_dirty(), line ~330


Re: bcache: Fix a writeback performance regression

2013-08-22 Thread Kent Overstreet
On Thu, Aug 22, 2013 at 09:32:52AM +0200, Stefan Priebe - Profihost AG wrote:
 great!
 
 Everything seems to work fine now! Except read_dirty always going to
 negative values after a reboot.

That one's fixed in 3.11 :)


Re: [PATCH 04/22] block: Abstract out bvec iterator

2013-08-13 Thread Kent Overstreet
On Tue, Aug 13, 2013 at 10:03:04AM -0400, Ed Cashin wrote:
 On Aug 9, 2013, Ed Cashin wrote:
  On Aug 8, 2013, at 9:05 PM, Kent Overstreet wrote:
  ...
   It's in the for-jens branch now.
  
  
  Just examining the patches, I like the way it cleans up the aoe code.  I
  had a question about a new BUG added by the for-jens branch the
  read-response handling path of the aoe driver.
 
 The aoe driver in linux-bcache/for-jens commit 4c36c973a8f45 is
 passing my tests.
 
 Here is a patch against that branch illustrating my suggestion for
 handling bad target responses gracefully.

Thanks - shall I just fold that into the aoe immutable bvec patch?


Re: [PATCH] idr: Document ida tree sections

2013-08-13 Thread Kent Overstreet
On Fri, Aug 09, 2013 at 10:57:56AM -0400, Tejun Heo wrote:
 Hello,
 
 On Wed, Aug 07, 2013 at 01:51:17PM -0700, Kent Overstreet wrote:
  + * So if the max section size is 64k, that's ~4096 sections, with 8 byte
  + * pointers that's a little over 32k for the pointers to sections.
  + *
  + * That means max size sections are order 4 page allocations.
 
 Order 4 allocations for common data structure doesn't really sound
 like a good idea to me.  It's gonna work fine on relatively large
 machines but suck on mobile / small embedded devices, many of which
 are still struggling with 32bit address space and compaction may not
 be enabled.  It just doens't make sense to me to impose 64k
 allocations from low level library function like ida.

I have a hard time seeing how it could really be an issue in practice -
keep in mind, for every bit in the ida tree we're going to have some
struct allocated somewhere else that the id corresponds to.

So for a 4k ida, that's... bare minimum around 128k memory that has to
be allocated somewhere else, and in a single subsystem, assuming 16 byte
structs - and typically it'll be many times that. 32k memory in the ida
-> 2 mb in the subsystem, absolute minimum.

If you're convinced this is a real issue though - how about
IDA_SECTION_SIZE conditional on CONFIG_COMPACTION, so we use order 2 or
3 allocations if CONFIG_COMPACTION=n?

Then the max size toplevel array of pointers to segments would be
bigger, but that's only an issue when we're allocating up to near
INT_MAX ids, so it's difficult to see how _that_ would be an issue on a
small/embedded system... and we could even use vmalloc for that
allocation when the size of that array is > IDA_SECTION_SIZE.


Re: [PATCH] idr: Document ida tree sections

2013-08-13 Thread Kent Overstreet
On Tue, Aug 13, 2013 at 06:19:28PM -0400, Tejun Heo wrote:
 Hello,
 
 On Tue, Aug 13, 2013 at 03:13:08PM -0700, Kent Overstreet wrote:
  If you're convinced this is a real issue though - how about
 
 It is a real issue.  Large order allocation is fine for optimization
 but shouldn't be depended upon.  It does fail easily without
 compaction and compaction is heavy-ass operation which will blow up
 any minute performance advantage you might get from avoiding proper
 radix tree implementation.
 
  IDA_SECTION_SIZE conditional on CONFIG_COMPACTION, so we use order 2 or
  3 allocations if CONFIG_COMPACTION=n?
  
  Then the max size toplevel array of pointers to segments would be
  bigger, but that's only an issue when we're allocating up to near
  INT_MAX ids, so it's difficult to see how _that_ would be an issue on a
  small/embedded system... and we could even use vmalloc for that
  allocation when the size of that array is  IDA_SECTION_SIZE.
 
 What about cyclic allocations then?  This is natrually a radix tree
 problem.  I don't know why you're resisting radix tree so much here.

It's only naturally a radix tree problem _if_ you require sparseness.
Otherwise, radix trees require pointer chasing, which we can avoid -
which saves us both the cost of chasing pointers (which is significant)
and the overhead of storing them.

The patch handles cyclic allocation by limiting sparseness - we talked
about this and I thought you were ok with this solution, though it was
awhile ago and I could be misremembering your comments.

To recap, here's the code that implements that sparseness limiting, it's
documented in ida_alloc_cyclic()'s docs:

static int __ida_alloc_cyclic(struct ida *ida, unsigned start, unsigned end,
			      gfp_t gfp, unsigned long *flags)
	__releases(&ida->lock)
	__acquires(&ida->lock)
{
	int ret;
	unsigned id;

	ret = __ida_alloc_range_multiple(ida, &id, 1,
					 max(start, ida->cur_id),
					 end, gfp, flags);

	if (ret < 0)
		ret = __ida_alloc_range_multiple(ida, &id, 1, start,
						 end, gfp, flags);
	if (ret == 1) {
		ida->cur_id = id + 1;
		if ((ida->cur_id - start) / 2 > max(1024U, ida->allocated_ids))
			ida->cur_id = 0;

		return id;
	}

	return ret;
}


Re: [PATCH] idr: Document ida tree sections

2013-08-13 Thread Kent Overstreet
On Fri, Aug 09, 2013 at 10:57:56AM -0400, Tejun Heo wrote:
 Hello,
 
 On Wed, Aug 07, 2013 at 01:51:17PM -0700, Kent Overstreet wrote:
  + * So if the max section size is 64k, that's ~4096 sections, with 8 byte
  + * pointers that's a little over 32k for the pointers to sections.
  + *
  + * That means max size sections are order 4 page allocations.
 
 Order 4 allocations for common data structure doesn't really sound
 like a good idea to me.  It's gonna work fine on relatively large
 machines but suck on mobile / small embedded devices, many of which
 are still struggling with 32bit address space and compaction may not
 be enabled.  It just doens't make sense to me to impose 64k
 allocations from low level library function like ida.

Would this be an acceptable solution?

From 483cfa0c809b7dc3b0abad93407468f273416578 Mon Sep 17 00:00:00 2001
From: Kent Overstreet k...@daterainc.com
Date: Tue, 13 Aug 2013 15:31:20 -0700
Subject: [PATCH] ida: Use order 2 allocations when COMPACTION=n

And fall back to vmalloc for the array of pointers to sections so we can
still allocate up to INT_MAX ids.

diff --git a/lib/idr.c b/lib/idr.c
index 02a221c..3bffb52 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -21,16 +21,20 @@
 #include <linux/export.h>
 #include <linux/idr.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
+#include <linux/vmalloc.h>
 
 static void kgfree(void *ptr, size_t size)
 {
-	if (size < PAGE_SIZE)
+	if (is_vmalloc_addr(ptr))
+		vfree(ptr);
+	else if (size < PAGE_SIZE)
kfree(ptr);
else
free_pages((unsigned long) ptr, get_order(size));
@@ -102,7 +106,14 @@ static void *kgalloc(size_t size, gfp_t gfp)
  */
 
 #define IDA_TREE_ARY   BITS_PER_LONG
-#define IDA_SECTION_SIZE	(64UL << 10)
+
+/* Max section size, in bytes */
+#ifdef CONFIG_COMPACTION
+#define IDA_SECTION_SIZE	(64UL << 10) /* order 4 page allocation */
+#else
+#define IDA_SECTION_SIZE	(16UL << 10) /* order 2 */
+#endif
+
 #define IDA_NODES_PER_SECTION  (IDA_SECTION_SIZE / sizeof(unsigned long))
 
 static inline unsigned long *ida_index_to_node(struct ida *ida, unsigned node)
@@ -251,8 +262,15 @@ again:
 
	if (ida->nodes >= IDA_NODES_PER_SECTION &&
is_power_of_2(cur_sections)) {
-   sections = kgalloc(cur_sections * 2 * sizeof(unsigned long *),
-  __GFP_ZERO|gfp);
+   size_t bytes = cur_sections * 2 * sizeof(unsigned long *);
+
+		if (bytes <= IDA_SECTION_SIZE)
+   sections = kgalloc(bytes, __GFP_ZERO|gfp);
+		else if (gfp & GFP_KERNEL)
+   sections = vzalloc(bytes);
+   else
+   sections = NULL;
+
if (!sections)
goto err;
}


Re: [PATCH] idr: Document ida tree sections

2013-08-13 Thread Kent Overstreet
On Tue, Aug 13, 2013 at 06:44:28PM -0400, Tejun Heo wrote:
 Hello, Kent.
 
 On Tue, Aug 13, 2013 at 03:27:59PM -0700, Kent Overstreet wrote:
  It's only naturally a radix tree problem _if_ you require sparseness.
 
 Well, it's not necessarily about requiring it but more about surviving
 it with some grace when things don't go as expected, which is an
 important characteristic for common library stuff.

The patch I posted should solve the high order allocations stuff, and
sparseness from cyclic allocations was already solved.

  Otherwise, radix trees require pointer chasing, which we can avoid -
  which saves us both the cost of chasing pointers (which is significant)
  and the overhead of storing them.
 
 Vast majority of which can be avoided with simple caching, right?

Whatever caching optimizations you do with a radix tree version I could
apply to this bitmap tree version, and my bitmap tree code is simpler
and _considerably_ faster than the existing code.


Re: [PATCH] idr: Document ida tree sections

2013-08-13 Thread Kent Overstreet
On Tue, Aug 13, 2013 at 07:22:11PM -0400, Tejun Heo wrote:
 Hello,
 
 On Tue, Aug 13, 2013 at 03:59:27PM -0700, Kent Overstreet wrote:
   Well, it's not necessarily about requiring it but more about surviving
   it with some grace when things don't go as expected, which is an
   important characteristic for common library stuff.
  
  The patch I posted should solve the high order allocations stuff, and
  sparseness from cyclic allocations was already solved.
 
 I don't know.  Yeah, using vmalloc would be able to work around the
 issue for most cases, I suppose.  It's iffy to consume vmalloc space
 from ida, which functionally is such a basic algorithmic construct.
 It probably won't worsen things noticeably but vmalloc area can be a
 very precious resource on 32bit configs.

This is only using it for the array of pointers to sections though, not
the bitmap itself - and only when that allocation is > 16k. For INT_MAX
allocated ids (absolute worst case) we'd be using 256k of vmalloc memory
on 64 bit, half that on 32 bit.

 
  Whatever caching optimizations you do with a radix tree version I could
  apply to this bitmap tree version, and my bitmap tree code is simpler
  and _considerably_ faster than the existing code.
 
 But the difference won't really matter.  Cached performance would be
 the same and that's likely to cover most cases, right?  It's not like
 radix tree is orders of magnitude slower.

Should probably be almost as good, yeah... in theory, but the space
efficiency still isn't going to be as good, and it'll probably be more
code... and at this point I really just don't want to futz with it more.
At this point unless there's something really wrong with this code I
just want to move onto something else :P


[GIT PULL] bcache fixes for 3.11

2013-08-14 Thread Kent Overstreet
Jens, here's the latest bcache fixes. Some urgent stuff in here:


The following changes since commit 79826c35eb99cd3c0873b8396f45fa26c87fb0b0:

  bcache: Allocation kthread fixes (2013-07-12 00:22:49 -0700)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11

for you to fetch changes up to 0434a516e99ee51ac2d0dfa71b2f56c89ac5db05:

  bcache: Fix a flush/fua performance bug (2013-08-14 15:44:51 -0700)


Gabriel de Perthuis (1):
  bcache: Strip endline when writing the label through sysfs

Geert Uytterhoeven (1):
  bcache: Correct printf()-style format length modifier

Kent Overstreet (4):
  bcache: Fix a dumb journal discard bug
  bcache: Fix for when no journal entries are found
  bcache: Fix a writeback performance regression
  bcache: Fix a flush/fua performance bug

 drivers/md/bcache/bcache.h|  7 +++
 drivers/md/bcache/btree.c |  2 +-
 drivers/md/bcache/journal.c   | 33 -
 drivers/md/bcache/sysfs.c |  9 +++--
 drivers/md/bcache/util.c  | 11 ++-
 drivers/md/bcache/util.h  | 12 +---
 drivers/md/bcache/writeback.c | 37 ++---
 7 files changed, 68 insertions(+), 43 deletions(-)


Re: [PATCH] idr: Document ida tree sections

2013-08-14 Thread Kent Overstreet
On Tue, Aug 13, 2013 at 07:59:47PM -0400, Tejun Heo wrote:
 Hey, Kent.
 
 On Tue, Aug 13, 2013 at 04:51:33PM -0700, Kent Overstreet wrote:
  Should probably be almost as good, yeah... in theory, but the space
  efficiency still isn't going to be as good, and it'll probably be more
  code... and at this point I really just don't want to futz with it more.
  Unless there's something really wrong with this code I just want to
  move on to something else :P
 
 I think it probably would be okay in most cases but don't feel
 confident about acking as it's making trade-offs which are unnecessary
 and unusual.  So, ummm, I really don't know.  Maybe it's better enough
 than what we have now but at the same time if you want to reimplement
 the whole thing you should be persistent / reliable enough to see it
 through this time around too, right?

I was just telling you how I felt :) Regardless of that, IMO what I've
got now is superior to any radix tree based approach for what ida/idr
are supposed to do. I could of course be wrong, but I'm not convinced...

Re: caching the last allocation with a radix tree based implementation -
I thought about that more last night, and I don't think it would be
viable for using ida underneath the percpu ida allocator.

Reason being percpu ida has to heavily optimize for the case where
almost all of the id space is allocated, and after a while the id space
is going to be fragmented - so caching the last allocated id is going to
be useless.

This is also why I implemented the ganged allocation bit, to amortize
the bitmap tree traversal. So we'd lose out on that going back to a
radix tree, or have to reimplement it (and it'll be slower due to
pointer chasing).

Which is all not the end of the world, but it means that if we punt on
the ida/idr rewrites for now or change our minds about them - we _do_
have quite a bit of stuff waiting on the percpu ida allocator, so for
that to go in separately I'll have to change it back to using a stack of
integers for the global freelist - which does use significantly more
memory.
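(For scale, with illustrative numbers: a stack of 32 bit integers for a
2^16 tag space is 256k, where a bitmap covering the same tag space is
8k.)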


Re: [GIT PULL] bcache fixes for 3.11

2013-08-15 Thread Kent Overstreet
On Thu, Aug 15, 2013 at 08:43:22AM +0200, Stefan Priebe - Profihost AG wrote:
 On 15.08.2013 00:59, Kent Overstreet wrote:
  Jens, here's the latest bcache fixes. Some urgent stuff in here:
  [...]
 
 As 3.10 is a long-term stable release, you might need to CC the stable
 list for patches which have to go to 3.10 as well.
 
 At least this one should go to 3.10:
  Gabriel de Perthuis (1):
bcache: Strip endline when writing the label through sysfs

That patch is fixing a bug in code that was merged for 3.11 - the rest
are marked for stable :)


Re: [PATCH 4/4] idr: Percpu ida

2013-06-18 Thread Kent Overstreet
On Tue, Jun 18, 2013 at 02:14:53PM +, Christoph Lameter wrote:
 On Mon, 17 Jun 2013, Kent Overstreet wrote:
 
  +static inline unsigned alloc_local_tag(struct percpu_ida *pool,
  +  struct percpu_ida_cpu *tags)
  +{
  +   int tag = -ENOSPC;
  +
  +   spin_lock(&tags->lock);
  +   if (tags->nr_free)
  +   tag = tags->freelist[--tags->nr_free];
  +   spin_unlock(&tags->lock);
  +
  +   return tag;
  +}
 
 This could be made much faster by avoiding real atomics (coming with
 spinlocks) and using per cpu atomics instead. Slub f.e. uses a single
 linked per cpu list managed via this_cpu_cmpxchg.

Actually, we do need the atomic ops - they're protecting against a
different cpu grabbing our freelist with steal_tags().

The alternative (which Andrew Morton suggested and Jens implemented in
his fork of an earlier version of this code) would be to use IPIs, but
there are reasonable use cases (high ratio of cpus:nr_tags) where tag
stealing is going to be reasonably common so we don't want it to suck
too much. Plus, steal_tags() needs to try different cpus' freelists
until it finds one with tags, IPIs would make that loop... problematic.

I actually originally used cmpxchg(), but then later in review I
realized I had an ABA problem and couldn't figure out how to solve it,
so that's when I switched to spinlocks (which Andrew Morton preferred
anyways).

But I just realized a few minutes ago I've seen your solution to ABA
with a stack, in the slub code - double word cmpxchg() with a
transaction id :) I may try that again just to see how the code looks,
but I'm not sure the difference in performance will be big enough to
matter, given that this lock should basically never be contended.
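For reference, the slub-style trick looks roughly like this - a percpu
freelist pop made ABA-safe by pairing the list head with a transaction
id and updating both with one double word cmpxchg. All names here are
made up for illustration; this is a sketch of the technique, not the
allocator's actual code:

	struct tag_node {
		struct tag_node *next;
	};

	struct tag_freelist {
		/* head and tid must be adjacent and suitably aligned
		 * for this_cpu_cmpxchg_double(): */
		struct tag_node	*head;
		unsigned long	tid;	/* bumped on every update */
	};

	static struct tag_node *pop_local(struct tag_freelist __percpu *fl)
	{
		struct tag_node *head;
		unsigned long tid;

		do {
			tid  = this_cpu_read(fl->tid);
			head = this_cpu_read(fl->head);
			if (!head)
				return NULL;
			/* deref'ing head here is only ok because tag
			 * nodes are never actually freed */
		} while (!this_cpu_cmpxchg_double(fl->head, fl->tid,
						  head, tid,
						  head->next, tid + 1));
		return head;
	}

Even if a remote steal_tags() pops and pushes nodes between the reads
and the cmpxchg, the tid mismatch makes the cmpxchg fail and the loop
retry - which is exactly what a plain cmpxchg() on the head alone can't
catch.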


[PATCH v2] lib/idr.c rewrite, percpu ida/tag allocator

2013-06-18 Thread Kent Overstreet
This is the second iteration of patches 1-4 - there's only been a few
trivial bugfixes for those.

The rest is the idr rewrite - I reimplemented it on top of the new ida
implementation and the existing radix tree implementation.

Patch series is available in my git repo:
git://evilpiepirate.org/~kent/linux-bcache.git idr

 arch/powerpc/mm/icswx_pid.c   |   36 +-
 arch/powerpc/mm/mmu_context_hash64.c  |   28 +-
 block/blk-core.c  |4 +-
 block/blk-sysfs.c |2 +-
 block/bsg.c   |2 +-
 block/genhd.c |2 +-
 drivers/atm/nicstar.c |4 +-
 drivers/base/platform.c   |6 +-
 drivers/base/soc.c|   18 +-
 drivers/block/drbd/drbd_main.c|6 +-
 drivers/block/drbd/drbd_nl.c  |2 +-
 drivers/block/loop.c  |4 +-
 drivers/block/mtip32xx/mtip32xx.c |   24 +-
 drivers/block/nvme-core.c |   33 +-
 drivers/block/rsxx/core.c |   21 +-
 drivers/block/virtio_blk.c|6 +-
 drivers/dca/dca-sysfs.c   |   18 +-
 drivers/dma/dmaengine.c   |2 +-
 drivers/firewire/core-cdev.c  |5 +-
 drivers/firewire/core-device.c|2 +-
 drivers/gpio/gpiolib.c|2 +-
 drivers/gpu/drm/drm_context.c |2 +-
 drivers/gpu/drm/drm_crtc.c|2 +-
 drivers/gpu/drm/drm_gem.c |8 +-
 drivers/gpu/drm/drm_stub.c|2 +-
 drivers/gpu/drm/exynos/exynos_drm_ipp.c   |2 +-
 drivers/gpu/drm/i915/i915_gem_context.c   |2 +-
 drivers/gpu/drm/qxl/qxl_cmd.c |4 +-
 drivers/gpu/drm/qxl/qxl_drv.h |1 -
 drivers/gpu/drm/qxl/qxl_kms.c |1 -
 drivers/gpu/drm/qxl/qxl_release.c |   19 +-
 drivers/gpu/drm/sis/sis_mm.c  |2 +-
 drivers/gpu/drm/via/via_mm.c  |2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_gmrid_manager.c |   31 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c  |4 +-
 drivers/hwmon/hwmon.c |6 +-
 drivers/hwmon/ibmaem.c|   10 +-
 drivers/i2c/i2c-core.c|4 +-
 drivers/iio/industrialio-core.c   |4 +-
 drivers/iio/industrialio-trigger.c|6 +-
 drivers/infiniband/core/cm.c  |7 +-
 drivers/infiniband/core/cma.c |2 +-
 drivers/infiniband/core/sa_query.c|4 +-
 drivers/infiniband/core/ucm.c |2 +-
 drivers/infiniband/core/ucma.c|4 +-
 drivers/infiniband/core/uverbs_cmd.c  |4 +-
 drivers/infiniband/hw/amso1100/c2.h   |1 -
 drivers/infiniband/hw/amso1100/c2_qp.c|   20 +-
 drivers/infiniband/hw/cxgb3/iwch.h|4 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h|4 +-
 drivers/infiniband/hw/ehca/ehca_cq.c  |4 +-
 drivers/infiniband/hw/ehca/ehca_qp.c  |4 +-
 drivers/infiniband/hw/ipath/ipath_driver.c|4 +-
 drivers/infiniband/hw/mlx4/cm.c   |2 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c|2 +-
 drivers/infiniband/hw/qib/qib_init.c  |4 +-
 drivers/input/input.c |6 +-
 drivers/iommu/iommu.c |   18 +-
 drivers/ipack/ipack.c |4 +-
 drivers/md/dm.c   |   22 +-
 drivers/memstick/core/memstick.c  |   17 +-
 drivers/memstick/core/mspro_block.c   |2 +-
 drivers/mfd/rtsx_pcr.c|   13 +-
 drivers/misc/c2port/core.c|   11 +-
 drivers/misc/cb710/core.c |   19 +-
 drivers/misc/tifm_core.c  |   15 +-
 drivers/mmc/core/host.c   |   13 +-
 drivers/mtd/mtdcore.c |4 +-
 drivers/net/macvtap.c |2 +-
 drivers/net/ppp/ppp_generic.c |4 +-
 drivers/power/bq2415x_charger.c   |2 +-
 drivers/power/bq27x00_battery.c   |2 +-
 drivers/power/ds2782_battery.c|2 +-
 drivers/pps/kapi.c|2 +-
 drivers/pps/pps.c |4 +-
 drivers/ptp/ptp_clock.c   |4 +-
 drivers/remoteproc/remoteproc_core.c  |8 +-
 drivers/rpmsg/virtio_rpmsg_bus.c  |4 +-
 drivers/rtc/class.c   |6 +-
 drivers/scsi/bfa/bfad_im.c|2 +-
 drivers/scsi/ch.c |   14 +-
 drivers/scsi/lpfc/lpfc_init.c |2 +-
 

[PATCH 03/10] idr: Rewrite ida

2013-06-18 Thread Kent Overstreet
This is a new, from scratch implementation of ida that should be
simpler, faster and more space efficient.

Two primary reasons for the rewrite:
 * A future patch will reimplement idr on top of this ida implementation +
   radix trees. Once that's done, the end result will be ~1k fewer lines
   of code, much simpler and easier to understand and it should be quite
   a bit faster.

 * The performance improvements and addition of ganged allocation should
   make ida more suitable for use by a percpu id/tag allocator, which
   would then act as a frontend to this allocator.

The old ida implementation was done with the idr data structures - this
was IMO backwards. I'll soon be reimplementing idr on top of this new
ida implementation and radix trees - using a separate dedicated data
structure for the free ID bitmap should actually make idr faster, and
the end result is _significantly_ less code.

This implementation conceptually isn't that different from the old one -
it's a tree of bitmaps, where one bit in a given node indicates whether
or not there are free bits in a child node.

The main difference (and advantage) over the old version is that the
tree isn't implemented with pointers - it's implemented in an array,
like how heaps are implemented, which both gives better space efficiency
and is faster since there's no pointer chasing.
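As a sketch of what "implemented in an array" means here (the helper
names are invented for illustration; the real tree is IDA_TREE_ARY =
BITS_PER_LONG wide, per lib/idr.c in this series):

	/* node i's children occupy a contiguous block of slots in
	 * ida->tree[]; no child pointers are stored anywhere */
	static inline unsigned first_child(unsigned node)
	{
		return node * IDA_TREE_ARY + 1;
	}

	static inline unsigned parent(unsigned node)
	{
		return (node - 1) / IDA_TREE_ARY;
	}

Descending from the root to a leaf is pure index arithmetic within one
allocation, instead of a pointer dereference per level.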

This does mean that the entire bitmap is stored in one contiguous memory
allocation - and as currently implemented we won't be able to allocate
_quite_ as many ids as with the previous implementation.

I don't expect this to be an issue in practice since anywhere this is
used, an id corresponds to a struct allocation somewhere else - we can't
allocate an unbounded number of ids, we'll run out of memory somewhere
else eventually, and I expect that to be the limiting factor in
practice.

If a user/use case does come up where this matters I can add some
sharding (or perhaps add a separate big_ida implementation) - but the
extra complexity would adversely affect performance for the users that
don't need > millions of ids, so I intend to leave the implementation as
is until if and when this becomes an issue.

Signed-off-by: Kent Overstreet koverstr...@google.com
Cc: Andrew Morton a...@linux-foundation.org
Cc: Tejun Heo t...@kernel.org
Cc: Stephen Rothwell s...@canb.auug.org.au
Cc: Fengguang Wu fengguang...@intel.com
---
 include/linux/idr.h | 118 +---
 lib/idr.c   | 776 +---
 2 files changed, 573 insertions(+), 321 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index c0e0c54..a78cf99 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -17,6 +17,88 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 
+/* IDA */
+
+#define IDA_INLINE_NODES   1
+
+struct ida {
+   spinlock_t  lock;
+
+   /*
+* cur_id and allocated_ids are for ida_alloc_cyclic. For cyclic
+* allocations we search for new ids to allocate starting from the last
+* id allocated - cur_id is the next id to try allocating.
+*
+* But we also don't want the allocated ids to be arbitrarily sparse -
+* the memory usage for the bitmap could be arbitrarily bad, and if
+* they're used as keys in a radix tree the memory overhead of the radix
+* tree could be quite bad as well. So we use allocated_ids to decide
+* when to restart cur_id from 0, and bound how sparse the bitmap can
+* be.
+*/
+   unsignedcur_id;
+   unsignedallocated_ids;
+
+   /* size of ida->tree */
+   unsignednodes;
+
+   /*
+* Index of first leaf node in ida->tree; equal to the number of non
+* leaf nodes, ida->nodes - ida->first_leaf == number of leaf nodes
+*/
+   unsignedfirst_leaf;
+
+   unsigned long   *tree;
+   unsigned long   inline_nodes[IDA_INLINE_NODES];
+};
+
+#define IDA_INIT(name) \
+{  \
+   .lock   = __SPIN_LOCK_UNLOCKED(name.lock),  \
+   .nodes  = IDA_INLINE_NODES, \
+   .first_leaf = 1,\
+   .tree   = name.inline_nodes,\
+}
+#define DEFINE_IDA(name)   struct ida name = IDA_INIT(name)
+
+void ida_remove(struct ida *ida, unsigned id);
+int ida_alloc_range(struct ida *ida, unsigned int start,
+ unsigned int end, gfp_t gfp);
+int ida_alloc_cyclic(struct ida *ida, unsigned start, unsigned end, gfp_t gfp);
+void ida_destroy(struct ida *ida);
+int ida_init_prealloc(struct ida *ida, unsigned prealloc);
+
+/**
+ * ida_alloc_range - allocate a new id.
+ * @ida: the (initialized) ida.
+ * @gfp_mask: memory allocation flags
+ *
+ * Allocates an id in the range [0, INT_MAX]. Returns
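A minimal usage sketch of the API declared above (hedged - the range and
return conventions are assumed from the declarations: a non-negative id
on success, a negative error code on failure):

	static DEFINE_IDA(example_ida);

	int get_example_id(void)
	{
		/* lowest free id in the requested range; GFP_KERNEL
		 * lets the bitmap tree grow if needed */
		return ida_alloc_range(&example_ida, 0, 100, GFP_KERNEL);
	}

	void put_example_id(int id)
	{
		ida_remove(&example_ida, id);
	}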

[PATCH 06/10] idr: Rename idr_get_next() - idr_find_next()

2013-06-18 Thread Kent Overstreet
get() implies taking a ref or sometimes an allocation, which this
function definitely does not do - rename it to something more sensible.

Signed-off-by: Kent Overstreet koverstr...@google.com
Cc: Andrew Morton a...@linux-foundation.org
Cc: Tejun Heo t...@kernel.org
---
 drivers/block/drbd/drbd_main.c | 2 +-
 drivers/block/drbd/drbd_nl.c   | 2 +-
 drivers/mtd/mtdcore.c  | 2 +-
 include/linux/idr.h| 4 ++--
 lib/idr.c  | 6 +++---
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a5dca6a..b84e4b2 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -500,7 +500,7 @@ int conn_lowest_minor(struct drbd_tconn *tconn)
int vnr = 0, m;
 
rcu_read_lock();
-   mdev = idr_get_next(&tconn->volumes, &vnr);
+   mdev = idr_find_next(&tconn->volumes, &vnr);
m = mdev ? mdev_to_minor(mdev) : -1;
rcu_read_unlock();
 
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9e3f441..8fa7eb1 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2837,7 +2837,7 @@ int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
}
if (tconn) {
 next_tconn:
-   mdev = idr_get_next(&tconn->volumes, &volume);
+   mdev = idr_find_next(&tconn->volumes, &volume);
if (!mdev) {
/* No more volumes to dump on this tconn.
 * Advance tconn iterator. */
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index c400c57..eaa1fcc 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(mtd_table_mutex);
 
 struct mtd_info *__mtd_next_device(int i)
 {
-   return idr_get_next(&mtd_idr, &i);
+   return idr_find_next(&mtd_idr, &i);
 }
 EXPORT_SYMBOL_GPL(__mtd_next_device);
 
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 35df6c7..ae3a3b4 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -209,7 +209,7 @@ int idr_alloc(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask);
 int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask);
 int idr_for_each(struct idr *idp,
 int (*fn)(int id, void *p, void *data), void *data);
-void *idr_get_next(struct idr *idp, int *nextid);
+void *idr_find_next(struct idr *idp, int *nextid);
 void *idr_replace(struct idr *idp, void *ptr, int id);
 void idr_remove(struct idr *idp, int id);
 void idr_free(struct idr *idp, int id);
@@ -260,7 +260,7 @@ static inline void *idr_find(struct idr *idr, int id)
  * is convenient for a not found value.
  */
 #define idr_for_each_entry(idp, entry, id) \
-   for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
+   for (id = 0; ((entry) = idr_find_next(idp, &(id))) != NULL; ++id)
 
 void __init idr_init_cache(void);
 
diff --git a/lib/idr.c b/lib/idr.c
index dae0920..21f86e8 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -1506,7 +1506,7 @@ int idr_for_each(struct idr *idp,
 EXPORT_SYMBOL(idr_for_each);
 
 /**
- * idr_get_next - lookup next object of id to given id.
+ * idr_find_next - lookup next object of id to given id.
  * @idp: idr handle
  * @nextidp:  pointer to lookup key
  *
@@ -1517,7 +1517,7 @@ EXPORT_SYMBOL(idr_for_each);
  * This function can be called under rcu_read_lock(), given that the leaf
  * pointers lifetimes are correctly managed.
  */
-void *idr_get_next(struct idr *idp, int *nextidp)
+void *idr_find_next(struct idr *idp, int *nextidp)
 {
struct idr_layer *p, *pa[MAX_IDR_LEVEL + 1];
struct idr_layer **paa = &pa[0];
@@ -1558,7 +1558,7 @@ void *idr_get_next(struct idr *idp, int *nextidp)
}
return NULL;
 }
-EXPORT_SYMBOL(idr_get_next);
+EXPORT_SYMBOL(idr_find_next);
 
 
 /**
-- 
1.8.3.1



[PATCH 05/10] idr: Kill old deprecated idr interfaces

2013-06-18 Thread Kent Overstreet
The deprecated idr interfaces don't have any in kernel users, so let's
delete them as prep work for the idr rewrite.

Signed-off-by: Kent Overstreet koverstr...@google.com
Cc: Andrew Morton a...@linux-foundation.org
Cc: Tejun Heo t...@kernel.org
---
 include/linux/idr.h | 63 -
 lib/idr.c   | 36 +++---
 2 files changed, 3 insertions(+), 96 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 3ba796b..35df6c7 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -262,69 +262,6 @@ static inline void *idr_find(struct idr *idr, int id)
 #define idr_for_each_entry(idp, entry, id) \
for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id)
 
-/*
- * Don't use the following functions.  These exist only to suppress
- * deprecated warnings on EXPORT_SYMBOL()s.
- */
-int __idr_pre_get(struct idr *idp, gfp_t gfp_mask);
-int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
-void __idr_remove_all(struct idr *idp);
-
-/**
- * idr_pre_get - reserve resources for idr allocation
- * @idp:   idr handle
- * @gfp_mask:  memory allocation flags
- *
- * Part of old alloc interface.  This is going away.  Use
- * idr_preload[_end]() and idr_alloc() instead.
- */
-static inline int __deprecated idr_pre_get(struct idr *idp, gfp_t gfp_mask)
-{
-   return __idr_pre_get(idp, gfp_mask);
-}
-
-/**
- * idr_get_new_above - allocate new idr entry above or equal to a start id
- * @idp: idr handle
- * @ptr: pointer you want associated with the id
- * @starting_id: id to start search at
- * @id: pointer to the allocated handle
- *
- * Part of old alloc interface.  This is going away.  Use
- * idr_preload[_end]() and idr_alloc() instead.
- */
-static inline int __deprecated idr_get_new_above(struct idr *idp, void *ptr,
-int starting_id, int *id)
-{
-   return __idr_get_new_above(idp, ptr, starting_id, id);
-}
-
-/**
- * idr_get_new - allocate new idr entry
- * @idp: idr handle
- * @ptr: pointer you want associated with the id
- * @id: pointer to the allocated handle
- *
- * Part of old alloc interface.  This is going away.  Use
- * idr_preload[_end]() and idr_alloc() instead.
- */
-static inline int __deprecated idr_get_new(struct idr *idp, void *ptr, int *id)
-{
-   return __idr_get_new_above(idp, ptr, 0, id);
-}
-
-/**
- * idr_remove_all - remove all ids from the given idr tree
- * @idp: idr handle
- *
- * If you're trying to destroy @idp, calling idr_destroy() is enough.
- * This is going away.  Don't use.
- */
-static inline void __deprecated idr_remove_all(struct idr *idp)
-{
-   __idr_remove_all(idp);
-}
-
 void __init idr_init_cache(void);
 
 #endif /* __IDR_H__ */
diff --git a/lib/idr.c b/lib/idr.c
index 4962e5d..dae0920 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -982,19 +982,6 @@ static void idr_mark_full(struct idr_layer **pa, int id)
}
 }
 
-int __idr_pre_get(struct idr *idp, gfp_t gfp_mask)
-{
-   while (idp->id_free_cnt < MAX_IDR_FREE) {
-   struct idr_layer *new;
-   new = kmem_cache_zalloc(idr_layer_cache, gfp_mask);
-   if (new == NULL)
-   return (0);
-   move_to_free_list(idp, new);
-   }
-   return 1;
-}
-EXPORT_SYMBOL(__idr_pre_get);
-
 /**
  * sub_alloc - try to allocate an id without growing the tree depth
  * @idp: idr handle
@@ -1160,21 +1147,6 @@ static void idr_fill_slot(struct idr *idr, void *ptr, int id,
idr_mark_full(pa, id);
 }
 
-int __idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id)
-{
-   struct idr_layer *pa[MAX_IDR_LEVEL + 1];
-   int rv;
-
-   rv = idr_get_empty_slot(idp, starting_id, pa, 0, idp);
-   if (rv < 0)
-   return rv == -ENOMEM ? -EAGAIN : rv;
-
-   idr_fill_slot(idp, ptr, rv, pa);
-   *id = rv;
-   return 0;
-}
-EXPORT_SYMBOL(__idr_get_new_above);
-
 /**
  * idr_preload - preload for idr_alloc()
  * @gfp_mask: allocation mask to use for preloading
@@ -1395,7 +1367,7 @@ void idr_remove(struct idr *idp, int id)
 }
 EXPORT_SYMBOL(idr_remove);
 
-void __idr_remove_all(struct idr *idp)
+static void __idr_remove_all(struct idr *idp)
 {
int n, id, max;
int bt_mask;
@@ -1428,7 +1400,6 @@ void __idr_remove_all(struct idr *idp)
}
idp->layers = 0;
 }
-EXPORT_SYMBOL(__idr_remove_all);
 
 /**
  * idr_destroy - release all cached layers within an idr tree
@@ -1490,13 +1461,12 @@ EXPORT_SYMBOL(idr_find_slowpath);
  * callback function will be called for each pointer currently
  * registered, passing the id, the pointer and the data pointer passed
  * to this function.  It is not safe to modify the idr tree while in
- * the callback, so functions such as idr_get_new and idr_remove are
- * not allowed.
+ * the callback, so functions such as idr_remove are not allowed.
  *
  * We check

[PATCH 08/10] idr: Reimplement idr on top of ida/radix trees

2013-06-18 Thread Kent Overstreet
The old idr code was really a second radix tree implementation - we
already have one in lib/radix-tree.c.

This patch reimplements idr on top of our existing radix trees, using
our shiny new ida implementation for allocating/freeing the ids. The old
idr code was noticeably slower than lib/radix-tree.c in at least some
benchmarks, so in addition to being ~500 lines less code this patch
should improve performance too.
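Given the new struct idr further down in this patch - just an ida plus a
radix tree keyed by id - the alloc path is easy to picture. A sketch
only (hedged: the exact ida_alloc_range() range convention is assumed,
and the idr_preload() caveat mentioned next is ignored):

	int example_idr_alloc(struct idr *idr, void *ptr, gfp_t gfp)
	{
		int id = ida_alloc_range(&idr->ida, 0, INT_MAX, gfp);
		if (id < 0)
			return id;

		if (radix_tree_insert(&idr->ptrs, id, ptr)) {
			ida_remove(&idr->ida, id);
			return -ENOMEM;
		}
		return id;
	}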

There's one thing left unfinished in this patch - the existing
idr_preload() interface won't work for ida. Another patch on top of this
will fix idr_preload() and update existing users to the new interface.

Signed-off-by: Kent Overstreet koverstr...@google.com
Cc: Andrew Morton a...@linux-foundation.org
Cc: Tejun Heo t...@kernel.org
---
 include/linux/idr.h | 157 -
 init/main.c |   1 -
 lib/idr.c   | 896 ++--
 3 files changed, 249 insertions(+), 805 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 4bce55c..ec789f5 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -1,6 +1,6 @@
 /*
  * include/linux/idr.h
- * 
+ *
  * 2002-10-18  written by Jim Houston jim.hous...@ccur.com
  * Copyright (C) 2002 by Concurrent Computer Corporation
  * Distributed under the GNU GPL license version 2.
@@ -12,10 +12,8 @@
 #ifndef __IDR_H__
 #define __IDR_H__
 
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/init.h>
-#include <linux/rcupdate.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
 #include <linux/spinlock_types.h>
 #include <linux/wait.h>
 
@@ -147,74 +145,42 @@ int percpu_ida_init(struct percpu_ida *pool, unsigned 
long nr_tags);
 
 /* IDR */
 
-/*
- * We want shallower trees and thus more bits covered at each layer.  8
- * bits gives us large enough first layer for most use cases and maximum
- * tree depth of 4.  Each idr_layer is slightly larger than 2k on 64bit and
- * 1k on 32bit.
+/**
+ * DOC: idr sync
+ * idr synchronization (stolen from radix-tree.h)
+ *
+ * idr_alloc() and idr_remove() do their own locking internally - the user need
+ * not be concerned with synchronization unless there's other operations that
+ * need to be done atomically.
+ *
+ * idr_find() does no locking - it can be called locklessly using RCU, if the
+ * caller ensures calls to this function are made within rcu_read_lock()
+ * regions and does all the other appropriate RCU stuff.
  */
-#define IDR_BITS 8
-#define IDR_SIZE (1 << IDR_BITS)
-#define IDR_MASK ((1 << IDR_BITS)-1)
-
-struct idr_layer {
-   int prefix; /* the ID prefix of this idr_layer */
-   DECLARE_BITMAP(bitmap, IDR_SIZE); /* A zero bit means space here */
-   struct idr_layer __rcu  *ary[1 << IDR_BITS];
-   int count;  /* When zero, we can release it */
-   int layer;  /* distance from leaf */
-   struct rcu_head rcu_head;
-};
 
 struct idr {
-   struct idr_layer __rcu  *hint;  /* the last layer allocated from */
-   struct idr_layer __rcu  *top;
-   struct idr_layer*id_free;
-   int layers; /* only valid w/o concurrent changes */
-   int id_free_cnt;
-   int cur;/* current pos for cyclic allocation */
-   spinlock_t  lock;
+   struct ida  ida;
+   struct radix_tree_root  ptrs;
 };
 
 #define IDR_INIT(name) \
 {  \
-   .lock   = __SPIN_LOCK_UNLOCKED(name.lock),  \
+   .ida= IDA_INIT(name.ida),   \
+   .ptrs   = RADIX_TREE_INIT(GFP_NOWAIT),  \
 }
 #define DEFINE_IDR(name)   struct idr name = IDR_INIT(name)
 
-/**
- * DOC: idr sync
- * idr synchronization (stolen from radix-tree.h)
- *
- * idr_find() is able to be called locklessly, using RCU. The caller must
- * ensure calls to this function are made within rcu_read_lock() regions.
- * Other readers (lock-free or otherwise) and modifications may be running
- * concurrently.
- *
- * It is still required that the caller manage the synchronization and
- * lifetimes of the items. So if RCU lock-free lookups are used, typically
- * this would mean that the items have their own locks, or are amenable to
- * lock-free access; and that the items are freed by RCU (or only freed after
- * having been deleted from the idr tree *and* a synchronize_rcu() grace
- * period).
- */
-
-/*
- * This is what we export.
- */
-
-void *idr_find_slowpath(struct idr *idp, int id);
-void idr_preload(gfp_t gfp_mask);
-int idr_alloc_range(struct idr *idp, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask);
-int idr_for_each(struct idr *idp,
+void *idr_find_next(struct idr *idr, int *nextid);
+int

[PATCH 04/10] idr: Percpu ida

2013-06-18 Thread Kent Overstreet
Percpu frontend for allocating ids. With percpu allocation (that works),
it's impossible to guarantee it will always be possible to allocate all
nr_tags - typically, some will be stuck on a remote percpu freelist
where the current job can't get to them.

We do guarantee that it will always be possible to allocate at least
(nr_tags / 2) tags - this is done by keeping track of which and how many
cpus have tags on their percpu freelists. On allocation failure if
enough cpus have tags that there could potentially be (nr_tags / 2) tags
stuck on remote percpu freelists, we then pick a remote cpu at random to
steal from.

Note that the synchronization is _definitely_ tricky - we're using
xchg()/cmpxchg() on the percpu lists to synchronize with steal_tags().

The alternative would've been adding a spinlock to protect the percpu
freelists, but that would've required some different tricky code to
avoid deadlock because of the lock ordering.

Note that there's no cpu hotplug notifier - we don't care, because
steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
more allocations if we had a notifier - but we'll still meet our
guarantees and it's absolutely not a correctness issue, so I don't think
it's worth the extra code.

Signed-off-by: Kent Overstreet koverstr...@google.com
Cc: Tejun Heo t...@kernel.org
Cc: Oleg Nesterov o...@redhat.com
Cc: Christoph Lameter c...@linux-foundation.org
Cc: Ingo Molnar mi...@redhat.com
Cc: Andi Kleen a...@firstfloor.org
Cc: Jens Axboe ax...@kernel.dk
Cc: Nicholas A. Bellinger n...@linux-iscsi.org
---
 include/linux/idr.h |  48 
 lib/idr.c   | 312 ++--
 2 files changed, 352 insertions(+), 8 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index a78cf99..3ba796b 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -16,6 +16,8 @@
 #include <linux/bitops.h>
 #include <linux/init.h>
 #include <linux/rcupdate.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
 
 /* IDA */
 
@@ -97,6 +99,52 @@ static inline void ida_init(struct ida *ida)
ida_init_prealloc(ida, 0);
 }
 
+/* Percpu IDA/tag allocator */
+
+struct percpu_ida_cpu;
+
+struct percpu_ida {
+   /*
+* number of tags available to be allocated, as passed to
+* percpu_ida_init()
+*/
+   unsignednr_tags;
+
+   struct percpu_ida_cpu __percpu  *tag_cpu;
+
+   /*
+* Bitmap of cpus that (may) have tags on their percpu freelists:
+* steal_tags() uses this to decide when to steal tags, and which cpus
+* to try stealing from.
+*
+* It's ok for a freelist to be empty when its bit is set - steal_tags()
+* will just keep looking - but the bitmap _must_ be set whenever a
+* percpu freelist does have tags.
+*/
+   unsigned long   *cpus_have_tags;
+
+   struct {
+   /*
+* When we go to steal tags from another cpu (see steal_tags()),
+* we want to pick a cpu at random. Cycling through them every
+* time we steal is a bit easier and more or less equivalent:
+*/
+   unsignedcpu_last_stolen;
+
+   /* For sleeping on allocation failure */
+   wait_queue_head_t   wait;
+
+   /* Global freelist */
+   struct ida  ida;
+   } ____cacheline_aligned_in_smp;
+};
+
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
+
+void percpu_ida_destroy(struct percpu_ida *pool);
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
+
 /* IDR */
 
 /*
diff --git a/lib/idr.c b/lib/idr.c
index e6b837a..4962e5d 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -28,17 +28,20 @@
  * with the slab allocator.
  */
 
-#ifndef TEST// to test in user space...
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#endif
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
 #include <linux/err.h>
-#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
 #include <linux/idr.h>
-#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/percpu.h>
-#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
 
 #define IDA_TREE_ARY   BITS_PER_LONG
 
@@ -527,6 +530,299 @@ int ida_init_prealloc(struct ida *ida, unsigned prealloc)
 }
 EXPORT_SYMBOL(ida_init_prealloc);
 
+/* Percpu IDA */
+
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define TAG_CPU_BATCH_MOVE 32U
+
+/* Max size of percpu freelist, */
+#define TAG_CPU_SIZE   (TAG_CPU_BATCH_MOVE + TAG_CPU_BATCH_MOVE / 2)
+
+struct percpu_ida_cpu {
+   spinlock_t

Re: [PATCH] Percpu tag allocator

2013-06-18 Thread Kent Overstreet
On Thu, Jun 13, 2013 at 12:23:39PM -0700, Andrew Morton wrote:
 On Thu, 13 Jun 2013 12:15:07 -0700 Tejun Heo t...@kernel.org wrote:
  Oh, I'm sure the current id[r|a] can be improved upon a lot but I'm
  very skeptical one can reach the level of scalability necessary for,
  say, pci-e attached extremely high-iops devices while still keeping
  the lowest number allocation, which can't be achieved without strong
  synchronization on each alloc/free.
  
  Maybe we can layer things so that we have percpu layer on top of
  id[r|a] and, say, mapping id to point is still done by idr, or the
  percpu tag allocator uses ida for tag chunk allocations, but it's
  still gonna be something extra on top.
 
 It's not obvious that explicit per-cpu is needed.  Get an ID from
 ida_get_new_above(), multiply it by 16 and store that in device-local
 storage, along with a 16-bit bitmap.  Blam, 30 lines of code and the
 ida_get_new_above() cost is reduced 16x and it's off the map.

I was just rereading emails and realized I should've replied to this.

So, I started out aiming for something simpler. I don't quite follow
what the approach you're suggesting is, but it turns out you really do
need the percpu freelists (batching allocation/freeing from the main
freelist) because ids/tags may not be freed on the cpu they were
allocated on.

In particular, if this is for a driver and the device doesn't implement
per cpu queues, tags are almost always going to be freed on a different
cpu. If you just give each cpu a fraction of the tag space they always
allocate out of (with ida_get_new_above() or similar) - that only helps
allocation; half the cacheline contention is freeing.

I originally wrote this tag allocator to use in a driver for a device
that didn't support multiple hardware queues at all, but it was fast
enough that any cacheline bouncing really hurt.

So that right there gets you to the basic design where you've got a
global freelist and percpu freelists, and you just use the percpu
freelists to batch up allocation/freeing to the global freelist.
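In sketch form (all names hypothetical, locking and preemption handling
elided - this is the shape of the design, not the submitted code):

	#define BATCH	32

	struct cpu_tags {			/* per cpu buffer */
		unsigned	nr;
		unsigned	tags[BATCH * 2];
	};

	struct pool {
		struct cpu_tags __percpu *cpu_tags;
		/* global freelist lives behind the helpers below */
	};

	/* hypothetical global-freelist helpers (locking lives in here) */
	unsigned global_grab(struct pool *p, unsigned *tags, unsigned nr);
	void global_return(struct pool *p, unsigned *tags, unsigned nr);

	int tag_alloc(struct pool *p)
	{
		struct cpu_tags *c = this_cpu_ptr(p->cpu_tags);

		if (!c->nr)	/* refill a batch from the global freelist */
			c->nr = global_grab(p, c->tags, BATCH);
		return c->nr ? c->tags[--c->nr] : -1;
	}

	void tag_free(struct pool *p, unsigned tag)
	{
		struct cpu_tags *c = this_cpu_ptr(p->cpu_tags);

		c->tags[c->nr++] = tag;
		if (c->nr == BATCH * 2) {	/* spill a batch back */
			global_return(p, c->tags + BATCH, BATCH);
			c->nr = BATCH;
		}
	}

The global freelist is only touched once per BATCH operations, so the
shared cacheline bounces once per batch instead of once per tag.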

The tag allocator I was going to submit for the aio code was pretty much
just that, nothing more. It was simple. It worked. I was happy with it.

The one concern with this approach is what happens when all the percpu
freelists except yours are full of free tags. Originally, I had an easy
solution - we calculate the size of the percpu freelists based on
nr_tags and num_possible_cpus, so that there can't be more than half of
the tag space stranded like this.
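(Concretely, with illustrative numbers: if each percpu freelist is
capped at nr_tags / (2 * num_possible_cpus) entries, then even with
every other cpu's freelist completely full, at most nr_tags / 2 tags are
out of reach - the exact formula was in that version of the patch.)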

(Keep in mind the main use case is drivers where the tag/id is used to
talk to the device, so you're limited by whatever the hardware people
thought was enough - 16 bits if you're lucky).

But then Tejun went and pointed out, just as I was about to mail it off
- Hey, what happens if you've got 4096 cpus and not all that many tags?
You've got a nice divide by zero in there.

After which I proceeded to swear at him a bit, but - well, it's a real
problem. And that is what led to the tag stealing stuff and all the
cmpxchg() shenanigans. And I'm pretty happy with the solution - there's
an elegance to it and I bet if I cared I could come up with a proof that
it's more or less optimal w.r.t. cacheline bounces for some decent
fraction of workloads we might care about. But yeah, it's not as simple
as I would've liked.

Anyways, now you've got an ida/idr api cleanup/rewrite to go along with
it, and it's all nicely integrated. Integrating the percpu tag allocator
with regular ida really doesn't save us any code - the original global
freelist was a stack and like 10 lines of code total.

But having the apis be consistent and having it all be organized and
pretty is nice. I think it is now, anyways :)


Re: [PATCH 03/10] idr: Rewrite ida

2013-06-19 Thread Kent Overstreet
On Wed, Jun 19, 2013 at 10:40:22AM +0100, Steven Whitehouse wrote:
 Hi,
 
 On Tue, 2013-06-18 at 17:02 -0700, Kent Overstreet wrote:
  This is a new, from scratch implementation of ida that should be
  simpler, faster and more space efficient.
  
 [...]
 
  
  This does mean that the entire bitmap is stored in one contiguous memory
  allocation - and as currently implemented we won't be able to allocate
  _quite_ as many ids as with the previous implementation.
  
  I don't expect this to be an issue in practice since anywhere this is
   used, an id corresponds to a struct allocation somewhere else - we can't
  allocate an unbounded number of ids, we'll run out of memory somewhere
  else eventually, and I expect that to be the limiting factor in
  practice.
  
  If a user/use case does come up where this matters I can add some
  sharding (or perhaps add a separate big_ida implementation) - but the
  extra complexity would adversely affect performance for the users that
  don't need  millions of ids, so I intend to leave the implementation as
  is until if and when this becomes an issue.
  
 
 Millions of IDs is something that is fairly normal for DLM, since there
 will be two DLM locks per cached inode with GFS2 and people tend to use
 it on pretty large servers with lots of memory.

Thanks, I wasn't aware of that. Is the 31 bits for the id limitation an
issue for you? While I'm at it, changing ids to longs should be fairly
trivial.


[GIT PULL] Bcache fixes for 3.11

2013-07-16 Thread Kent Overstreet
Hey Jens - I've been busy torture testing and chasing bugs, here's the
fruits of my labors. These are all fairly small fixes, some of them
quite important:

The following changes since commit 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7:

  bcache: Use standard utility code (2013-07-01 14:43:53 -0700)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.11

for you to fetch changes up to 79826c35eb99cd3c0873b8396f45fa26c87fb0b0:

  bcache: Allocation kthread fixes (2013-07-12 00:22:49 -0700)


Dan Carpenter (1):
  bcache: check for allocation failures

Kent Overstreet (7):
  bcache: Fix a dumb race
  bcache: Advertise that flushes are supported
  bcache: Fix a sysfs splat on shutdown
  bcache: Shutdown fix
  bcache: Journal replay fix
  bcache: Fix GC_SECTORS_USED() calculation
  bcache: Allocation kthread fixes

 drivers/md/bcache/alloc.c   | 18 --
 drivers/md/bcache/bcache.h  |  5 +
 drivers/md/bcache/btree.c   |  4 +++-
 drivers/md/bcache/closure.c |  6 --
 drivers/md/bcache/journal.c |  7 ++-
 drivers/md/bcache/request.c |  8 +++-
 drivers/md/bcache/super.c   | 42 ++
 drivers/md/bcache/sysfs.c   |  2 ++
 8 files changed, 61 insertions(+), 31 deletions(-)


[GIT PULL] Bcache changes for 3.12

2013-09-10 Thread Kent Overstreet
Jens, here's the bcache stuff for 3.12. This is on top of the pull
request I sent you of fixes for 3.11. Pull away!

The following changes since commit bef0ef06a9fe34b08177b67d3213dceab29d3abe:

  bcache: Fix for handling overlapping extents when reading in a btree node (2013-09-03 13:42:35 -0700)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.12

for you to fetch changes up to 2879736e8a022780e28da9e4502815b510fcfe8b:

  bcache: Bypass torture test (2013-09-10 19:08:32 -0700)


Kent Overstreet (49):
  bcache: Use blkdev_issue_discard()
  bcache: Add on error panic/unregister setting
  bcache: Stripe size isn't necessarily a power of two
  bcache: Remove unnecessary check in should_split()
  bcache: Explicitly track btree node's parent
  bcache: Add btree_insert_node()
  bcache: Insert multiple keys at a time
  bcache: Convert btree_insert_check_key() to btree_insert_node()
  bcache: Add explicit keylist arg to btree_insert()
  bcache: Clean up keylist code
  bcache: Refactor request_write()
  bcache: Refactor read request code a bit
  bcache: Refactor journalling flow control
  bcache: Move keylist out of btree_op
  bcache: Convert try_wait to wait_queue_head_t
  bcache: Convert bucket_wait to wait_queue_head_t
  bcache: Convert gc to a kthread
  bcache: Convert writeback to a kthread
  bcache: Add btree_map() functions
  bcache: Move some stuff to btree.c
  bcache: Convert bch_btree_read_async() to bch_btree_map_keys()
  bcache: Clean up cache_lookup_fn
  bcache: Prune struct btree_op
  bcache: Kill op-cl
  bcache: Drop some closure stuff
  bcache: Kill op-replace
  bcache: Don't use op-insert_collision
  bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes()
  bcache: Break up struct search
  bcache: Move sector allocator to alloc.c
  bcache: Pull on disk data structures out into a separate header
  bcache: Fix bch_ptr_bad()
  bcache: Debug code improvements
  bcache: Don't bother with bucket refcount for btree node allocations
  bcache: bch_(btree|extent)_ptr_invalid()
  bcache: PRECEDING_KEY()
  bcache: Add btree_node_write_sync()
  bcache: Add make_btree_freeing_key()
  bcache: Incremental gc
  bcache: Avoid deadlocking in garbage collection
  bcache: Kill bch_next_recurse_key()
  bcache: Kill sequential_merge option
  bcache: Move spinlock into struct time_stats
  bcache: Have btree_split() insert into parent directly
  bcache: Better full stripe scanning
  bcache: Fix sysfs splat on shutdown with flash only devs
  bcache: Use ida for bcache block dev minor
  bcache: Delete some slower inline asm
  bcache: Bypass torture test

 drivers/md/bcache/Kconfig |   11 +-
 drivers/md/bcache/alloc.c |  383 +++-
 drivers/md/bcache/bcache.h|  327 +-
 drivers/md/bcache/bset.c  |  289 -
 drivers/md/bcache/bset.h  |   93 +--
 drivers/md/bcache/btree.c | 1385 ++---
 drivers/md/bcache/btree.h |  195 ++
 drivers/md/bcache/closure.c   |   25 +-
 drivers/md/bcache/closure.h   |  110 +---
 drivers/md/bcache/debug.c |  170 +++--
 drivers/md/bcache/debug.h |   50 +-
 drivers/md/bcache/journal.c   |  290 -
 drivers/md/bcache/journal.h   |   49 +-
 drivers/md/bcache/movinggc.c  |   85 ++-
 drivers/md/bcache/request.c   | 1106 
 drivers/md/bcache/request.h   |   43 +-
 drivers/md/bcache/stats.c |   26 +-
 drivers/md/bcache/stats.h |   13 +-
 drivers/md/bcache/super.c |  188 +++---
 drivers/md/bcache/sysfs.c |   42 +-
 drivers/md/bcache/trace.c |1 -
 drivers/md/bcache/util.c  |   12 +-
 drivers/md/bcache/util.h  |   15 +-
 drivers/md/bcache/writeback.c |  455 +++---
 drivers/md/bcache/writeback.h |   47 +-
 include/trace/events/bcache.h |   47 +-
 include/uapi/linux/bcache.h   |  373 +++
 27 files changed, 2966 insertions(+), 2864 deletions(-)
 create mode 100644 include/uapi/linux/bcache.h


[GIT PULL] bcache changes for 3.13

2013-11-04 Thread Kent Overstreet
Hey Jens, sorry for being late with this - anyways, it's roughly the same set of
patches you had queued up before plus a few minor fixes, but it's been rebased
onto your for-3.13/core branch.


The following changes since commit febca1baea1cfe2d7a0271385d89b03d5fb34f94:

  block: setup bi_vcnt on clones (2013-10-31 13:32:42 -0600)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git bcache-for-3.13

for you to fetch changes up to 0bb953dba5d295b03bb035bfc2d28ed4155df377:

  bcache: defensively handle format strings (2013-11-01 18:03:20 -0700)


Kees Cook (1):
  bcache: defensively handle format strings

Kent Overstreet (53):
  bcache: Fixed incorrect order of arguments to bio_alloc_bioset()
  bcache: Convert bch_data_verify() to immutable biovecs
  bcache: Fix a journalling performance bug
  bcache: Fix a lockdep splat
  bcache: Use blkdev_issue_discard()
  bcache: Add on error panic/unregister setting
  bcache: Stripe size isn't necessarily a power of two
  bcache: Remove unnecessary check in should_split()
  bcache: Explicitly track btree node's parent
  bcache: Add btree_insert_node()
  bcache: Insert multiple keys at a time
  bcache: Convert btree_insert_check_key() to btree_insert_node()
  bcache: Add explicit keylist arg to btree_insert()
  bcache: Clean up keylist code
  bcache: Refactor request_write()
  bcache: Refactor read request code a bit
  bcache: Refactor journalling flow control
  bcache: Move keylist out of btree_op
  bcache: Convert try_wait to wait_queue_head_t
  bcache: Convert bucket_wait to wait_queue_head_t
  bcache: Convert gc to a kthread
  bcache: Convert writeback to a kthread
  bcache: Add btree_map() functions
  bcache: Move some stuff to btree.c
  bcache: Convert bch_btree_read_async() to bch_btree_map_keys()
  bcache: Clean up cache_lookup_fn
  bcache: Prune struct btree_op
  bcache: Kill op-cl
  bcache: Drop some closure stuff
  bcache: Kill op-replace
  bcache: Don't use op-insert_collision
  bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes()
  bcache: Break up struct search
  bcache: Move sector allocator to alloc.c
  bcache: Pull on disk data structures out into a separate header
  bcache: Fix bch_ptr_bad()
  bcache: Debug code improvements
  bcache: Don't bother with bucket refcount for btree node allocations
  bcache: bch_(btree|extent)_ptr_invalid()
  bcache: PRECEDING_KEY()
  bcache: Add btree_node_write_sync()
  bcache: Add make_btree_freeing_key()
  bcache: Incremental gc
  bcache: Avoid deadlocking in garbage collection
  bcache: Kill bch_next_recurse_key()
  bcache: Kill sequential_merge option
  bcache: Move spinlock into struct time_stats
  bcache: Have btree_split() insert into parent directly
  bcache: Better full stripe scanning
  bcache: Fix sysfs splat on shutdown with flash only devs
  bcache: Use ida for bcache block dev minor
  bcache: Delete some slower inline asm
  bcache: Bypass torture test

 drivers/md/bcache/Kconfig |   11 +-
 drivers/md/bcache/alloc.c |  383 +++-
 drivers/md/bcache/bcache.h|  327 +-
 drivers/md/bcache/bset.c  |  289 -
 drivers/md/bcache/bset.h  |   93 +--
 drivers/md/bcache/btree.c | 1386 ++---
 drivers/md/bcache/btree.h |  195 ++
 drivers/md/bcache/closure.c   |  103 +--
 drivers/md/bcache/closure.h   |  183 +-
 drivers/md/bcache/debug.c |  202 +++---
 drivers/md/bcache/debug.h |   50 +-
 drivers/md/bcache/journal.c   |  293 +
 drivers/md/bcache/journal.h   |   52 +-
 drivers/md/bcache/movinggc.c  |   85 ++-
 drivers/md/bcache/request.c   | 1106 
 drivers/md/bcache/request.h   |   42 +-
 drivers/md/bcache/stats.c |   26 +-
 drivers/md/bcache/stats.h |   13 +-
 drivers/md/bcache/super.c |  190 +++---
 drivers/md/bcache/sysfs.c |   42 +-
 drivers/md/bcache/trace.c |1 -
 drivers/md/bcache/util.c  |   12 +-
 drivers/md/bcache/util.h  |   15 +-
 drivers/md/bcache/writeback.c |  455 +++---
 drivers/md/bcache/writeback.h |   46 +-
 include/trace/events/bcache.h |   47 +-
 include/uapi/linux/bcache.h   |  373 +++
 27 files changed, 3006 insertions(+), 3014 deletions(-)
 create mode 100644 include/uapi/linux/bcache.h


Re: [PATCH for-next] dm: fix missing bi_remaining accounting

2013-11-04 Thread Kent Overstreet
On Mon, Nov 04, 2013 at 10:06:00AM -0500, Mikulas Patocka wrote:
 
 
 On Fri, 1 Nov 2013, Jens Axboe wrote:
 
  On 11/01/2013 07:59 AM, Mike Snitzer wrote:
   Add the missing bi_remaining increment, required by the block layer's
   new bio-chaining code, to both the verity and old snapshot DM targets.
   
   Otherwise users will hit the bi_remaining = 0 BUG_ON in bio_endio().
  
  Thanks Mike, added to the mix.
  
  -- 
  Jens Axboe
 
 Hi
 
 This improves a little bit on the previous patch, by replacing costly 
 atomic_inc with cheap atomic_set.

IMO, this is a bad idea; the behaviour with this patch does _not_ match the
naming of bio_endio_nodec(), and the performance difference should be well in
the noise anyways because we're touching a cacheline we already have in cache
and that won't be contended.

The fact that it's currently safe is accidental, I could see this easily
tripping people up and being a pain in the ass to debug in the future.

 
 
 From: Mikulas Patocka mpato...@redhat.com
 
 dm: change atomic_inc to atomic_set(1)
 
 There are places in dm where we save bi_endio and bi_private, set them to
 target's routine, submit the bio, from the target's bi_endio routine we
 restore bi_endio and bi_private and end the bio with bi_endio.
 
 This causes underflow of bi_remaining, so we must restore bi_remaining
 before ending the bio from the target bi_endio routine.
 
 The code uses atomic_inc for restoration of bi_remaining. This patch
 changes it to atomic_set(1) to avoid an interlocked instruction. In the
 target's bi_endio routine we are sure that bi_remaining is zero
 (otherwise, the bi_endio routine wouldn't be called) and there are no
 concurrent users of the bio, so we can replace atomic_inc with
 atomic_set(1).
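For context, the save/restore pattern being described looks roughly like
this (hypothetical names, per-bio bookkeeping elided - a sketch, not the
actual dm diff):

	static void clone_endio(struct bio *bio, int error)
	{
		struct dm_saved *s = bio->bi_private;	/* made-up struct */

		bio->bi_end_io	= s->saved_end_io;
		bio->bi_private	= s->saved_private;

		/* bi_remaining just hit 0, or we wouldn't be here;
		 * restore it before ending the bio a second time: */
		atomic_set(&bio->bi_remaining, 1);	/* was atomic_inc() */

		bio_endio(bio, error);
	}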


Re: [GIT PULL] bcache changes for 3.13

2013-11-04 Thread Kent Overstreet
On Mon, Nov 04, 2013 at 01:12:17PM -0700, Jens Axboe wrote:
 On 11/04/2013 01:07 PM, Kent Overstreet wrote:
  Hey Jens, sorry for being late with this - anyways, it's roughly the same 
  set of
  patches you had queued up before plus a few minor fixes, but it's been 
  rebased
  onto your for-3.13/core branch.
 
 Merge window is a little later this time, and since it was already in
 for the previous release, we can make it happen.
 
 But can you please base it on for-3.13/drivers however? That's where I'd
 like to pull in the driver bits, and if I pull this one, then things get
 even more tangled since for-3.13/core is holding the blk-mq changes too.

I'd like to do the merge with immutable biovecs myself though - there are some
tricky bits in there. How will that work?


[PATCH] Block layer stuff/DIO rewrite prep for 3.14

2013-11-04 Thread Kent Overstreet
Now that immutable biovecs is in, these are the remaining patches required for
my DIO rewrite, along with some related cleanup/refactoring.

The key enabler is patch 4 - making generic_make_request() handle arbitrary
sized bios. This takes what was once bio_add_page()'s responsibility and pushes
it down; the long term plan is hopefully to push this all the way down to the
driver level, which should simplify a lot of code (a lot of it very fragile
code!) and improve performance at the same time.
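The direction, sketched (bio_split() here is the new interface from the
rewrite mentioned below, which splits off and returns the first
'sectors' sectors and leaves the rest in the original bio; 'max_sectors',
'bs' and 'handle_one()' are hypothetical stand-ins for a driver's limit,
bio_set and per-piece processing, with completion chaining elided):

	while (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors,
					      GFP_NOIO, bs);
		handle_one(split);
	}
	handle_one(bio);	/* remainder is now within the limit */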

The DIO rewrite needs some more work before it'll be ready, but I wanted to get
this patch series out because this stuff should all be ready and it's useful for
other reasons - I think this stuff will make it a lot easier for the btrfs
people to do what they need with their DIO code.

Other stuff enabled by this:

 * With this and the bio_split() rewrite already in, we can just delete
   merge_bvec_fn. I have patches for this, but they'll need more testing.

 * Multipage bvecs - bvecs pointing to an arbitrary amount of contiguous
   physical memory. I have this working but it'll need more testing and code
   auditing - this is what lets us kill bi_seg_front_size and bi_seg_back_size
   though.

Patch series is based on Jens' for-next tree, and it's available in my git
repository - git://evilpiepirate.org/~kent/linux-bcache.git for-jens



[PATCH 2/9] block: submit_bio_wait() conversions

2013-11-04 Thread Kent Overstreet
It was being open coded in a few places.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Joern Engel jo...@logfs.org
Cc: Prasad Joshi prasadjoshi.li...@gmail.com
---
 block/blk-flush.c   | 19 +--
 fs/logfs/dev_bdev.c |  8 +---
 2 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 5580b05..9288aaf 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q)
}
 }
 
-static void bio_end_flush(struct bio *bio, int err)
-{
-   if (err)
-   clear_bit(BIO_UPTODATE, &bio->bi_flags);
-   if (bio->bi_private)
-   complete(bio->bi_private);
-   bio_put(bio);
-}
-
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:  blockdev to issue flush for
@@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err)
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
 {
-   DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
int ret = 0;
@@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
return -ENXIO;
 
bio = bio_alloc(gfp_mask, 0);
-   bio->bi_end_io = bio_end_flush;
bio->bi_bdev = bdev;
-   bio->bi_private = &wait;
 
-   bio_get(bio);
-   submit_bio(WRITE_FLUSH, bio);
-   wait_for_completion_io(&wait);
+   ret = submit_bio_wait(WRITE_FLUSH, bio);
 
/*
 * The driver must store the error location in ->bi_sector, if
@@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
if (error_sector)
*error_sector = bio->bi_iter.bi_sector;
 
-   if (!bio_flagged(bio, BIO_UPTODATE))
-   ret = -EIO;
-
bio_put(bio);
return ret;
 }
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index ca42715..80adce7 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -23,7 +23,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 {
struct bio bio;
struct bio_vec bio_vec;
-   struct completion complete;
 
bio_init(&bio);
bio.bi_max_vecs = 1;
@@ -35,13 +34,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
bio.bi_iter.bi_size = PAGE_SIZE;
bio.bi_bdev = bdev;
bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
-   init_completion(&complete);
-   bio.bi_private = &complete;
-   bio.bi_end_io = request_complete;
 
-   submit_bio(rw, &bio);
-   wait_for_completion(&complete);
-   return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+   return submit_bio_wait(rw, &bio);
 }
 
 static int bdev_readpage(void *_sb, struct page *page)
-- 
1.8.4.rc3



[PATCH 9/9] block: Add bio_get_user_pages()

2013-11-04 Thread Kent Overstreet
This replaces some of the code that was in __bio_map_user_iov(), and
soon we're going to use this helper in the dio code.

Note that this relies on the recent change to make
generic_make_request() take arbitrary sized bios - we're not using
bio_add_page() here.
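
To make the intended use concrete, a caller might look something like this -
a hedged sketch, with map_user_range() and its (leaky) error handling made up
for illustration; the real dio conversion comes later:

    #include <linux/bio.h>
    #include <linux/err.h>

    /* Hypothetical: pin a user buffer into a freshly allocated bio. */
    static struct bio *map_user_range(unsigned long uaddr, unsigned long len,
                                      int write_to_vm, gfp_t gfp)
    {
        /* + 1 because an unaligned buffer can straddle an extra page;
         * assumes the result fits in one bio (<= BIO_MAX_PAGES) */
        struct bio *bio = bio_alloc(gfp, DIV_ROUND_UP(len, PAGE_SIZE) + 1);
        ssize_t ret;

        if (!bio)
            return ERR_PTR(-ENOMEM);

        while (len) {
            /* may pin fewer bytes than asked for - keep going */
            ret = bio_get_user_pages(bio, uaddr, len, write_to_vm);
            if (ret <= 0) {
                bio_put(bio);   /* XXX: leaks page pins on partial failure */
                return ERR_PTR(ret ? ret : -EFAULT);
            }

            uaddr += ret;
            len -= ret;
        }

        return bio;
    }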

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 fs/bio.c| 124 +++-
 include/linux/bio.h |   2 +
 2 files changed, 67 insertions(+), 59 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index c60bfcb..a10b350 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1124,17 +1124,70 @@ struct bio *bio_copy_user(struct request_queue *q, 
struct rq_map_data *map_data,
 }
 EXPORT_SYMBOL(bio_copy_user);
 
+/**
+ * bio_get_user_pages - pin user pages and add them to a biovec
+ * @bio: bio to add pages to
+ * @uaddr: start of the user address range
+ * @len: length in bytes
+ * @write_to_vm: set if the pages will be written to (i.e. a device read)
+ *
+ * Pins pages for up to @len bytes and appends them to @bio's bvec array. May
+ * pin only part of the requested pages - @bio need not have room for all the
+ * pages and can already have had pages added to it.
+ *
+ * Returns the number of bytes from @len added to @bio.
+ */
+ssize_t bio_get_user_pages(struct bio *bio, unsigned long uaddr,
+  unsigned long len, int write_to_vm)
+{
+   int ret;
+   unsigned nr_pages, bytes;
+   unsigned offset = offset_in_page(uaddr);
+   struct bio_vec *bv;
+   struct page **pages;
+
+   nr_pages = min_t(size_t,
+DIV_ROUND_UP(len + offset, PAGE_SIZE),
+bio-bi_max_vecs - bio-bi_vcnt);
+
+   bv = bio-bi_io_vec[bio-bi_vcnt];
+   pages = (void *) bv;
+
+   ret = get_user_pages_fast(uaddr, nr_pages, write_to_vm, pages);
+   if (ret  0)
+   return ret;
+
+   bio-bi_vcnt += ret;
+   bytes = ret * PAGE_SIZE - offset;
+
+   while (ret--) {
+   bv[ret].bv_page = pages[ret];
+   bv[ret].bv_len = PAGE_SIZE;
+   bv[ret].bv_offset = 0;
+   }
+
+   bv[0].bv_offset += offset;
+   bv[0].bv_len -= offset;
+
+   if (bytes  len) {
+   bio-bi_io_vec[bio-bi_vcnt - 1].bv_len -= bytes - len;
+   bytes = len;
+   }
+
+   bio-bi_iter.bi_size += bytes;
+
+   return bytes;
+}
+EXPORT_SYMBOL(bio_get_user_pages);
+
 static struct bio *__bio_map_user_iov(struct request_queue *q,
  struct block_device *bdev,
  struct sg_iovec *iov, int iov_count,
  int write_to_vm, gfp_t gfp_mask)
 {
-   int i, j;
-   int nr_pages = 0;
-   struct page **pages;
+   ssize_t ret;
+   int i, nr_pages = 0;
struct bio *bio;
-   int cur_page = 0;
-   int ret, offset;
 
for (i = 0; i  iov_count; i++) {
unsigned long uaddr = (unsigned long)iov[i].iov_base;
@@ -1163,57 +1216,17 @@ static struct bio *__bio_map_user_iov(struct 
request_queue *q,
if (!bio)
return ERR_PTR(-ENOMEM);
 
-   ret = -ENOMEM;
-   pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
-   if (!pages)
-   goto out;
-
for (i = 0; i  iov_count; i++) {
-   unsigned long uaddr = (unsigned long)iov[i].iov_base;
-   unsigned long len = iov[i].iov_len;
-   unsigned long end = (uaddr + len + PAGE_SIZE - 1)  PAGE_SHIFT;
-   unsigned long start = uaddr  PAGE_SHIFT;
-   const int local_nr_pages = end - start;
-   const int page_limit = cur_page + local_nr_pages;
-
-   ret = get_user_pages_fast(uaddr, local_nr_pages,
-   write_to_vm, pages[cur_page]);
-   if (ret  local_nr_pages) {
-   ret = -EFAULT;
-   goto out_unmap;
-   }
-
-   offset = uaddr  ~PAGE_MASK;
-   for (j = cur_page; j  page_limit; j++) {
-   unsigned int bytes = PAGE_SIZE - offset;
+   ret = bio_get_user_pages(bio, (size_t) iov[i].iov_base,
+iov[i].iov_len,
+write_to_vm);
+   if (ret  0)
+   goto out;
 
-   if (len = 0)
-   break;
-   
-   if (bytes  len)
-   bytes = len;
-
-   /*
-* sorry...
-*/
-   if (bio_add_pc_page(q, bio, pages[j], bytes, offset) 
-   bytes)
-   break;
-
-   len -= bytes;
-   offset = 0;
-   }
-
-   cur_page = j

[PATCH 8/9] bcache: generic_make_request() handles large bios now

2013-11-04 Thread Kent Overstreet
So we get to delete our hacky workaround.

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/md/bcache/bcache.h|  18 
 drivers/md/bcache/io.c| 100 +-
 drivers/md/bcache/journal.c   |   4 +-
 drivers/md/bcache/request.c   |  16 +++
 drivers/md/bcache/super.c |  33 ++
 drivers/md/bcache/util.h  |   5 ++-
 drivers/md/bcache/writeback.c |   4 +-
 include/linux/bio.h   |  12 -
 8 files changed, 19 insertions(+), 173 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 964353c..8f65331 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -241,19 +241,6 @@ struct keybuf {
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
 };
 
-struct bio_split_pool {
-   struct bio_set  *bio_split;
-   mempool_t   *bio_split_hook;
-};
-
-struct bio_split_hook {
-   struct closure  cl;
-   struct bio_split_pool   *p;
-   struct bio  *bio;
-   bio_end_io_t*bi_end_io;
-   void*bi_private;
-};
-
 struct bcache_device {
struct closure  cl;
 
@@ -286,8 +273,6 @@ struct bcache_device {
int (*cache_miss)(struct btree *, struct search *,
  struct bio *, unsigned);
int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct io {
@@ -465,8 +450,6 @@ struct cache {
atomic_long_t   meta_sectors_written;
atomic_long_t   btree_sectors_written;
atomic_long_t   sectors_written;
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct gc_stat {
@@ -901,7 +884,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, 
const char *);
 void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
-void bch_generic_make_request(struct bio *, struct bio_split_pool *);
 void __bch_submit_bbio(struct bio *, struct cache_set *);
 void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, 
unsigned);
 
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa..86a0bb8 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,104 +11,6 @@
 
 #include linux/blkdev.h
 
-static unsigned bch_bio_max_sectors(struct bio *bio)
-{
-   struct request_queue *q = bdev_get_queue(bio-bi_bdev);
-   struct bio_vec bv;
-   struct bvec_iter iter;
-   unsigned ret = 0, seg = 0;
-
-   if (bio-bi_rw  REQ_DISCARD)
-   return min(bio_sectors(bio), q-limits.max_discard_sectors);
-
-   bio_for_each_segment(bv, bio, iter) {
-   struct bvec_merge_data bvm = {
-   .bi_bdev= bio-bi_bdev,
-   .bi_sector  = bio-bi_iter.bi_sector,
-   .bi_size= ret  9,
-   .bi_rw  = bio-bi_rw,
-   };
-
-   if (seg == min_t(unsigned, BIO_MAX_PAGES,
-queue_max_segments(q)))
-   break;
-
-   if (q-merge_bvec_fn 
-   q-merge_bvec_fn(q, bvm, bv)  (int) bv.bv_len)
-   break;
-
-   seg++;
-   ret += bv.bv_len  9;
-   }
-
-   ret = min(ret, queue_max_sectors(q));
-
-   WARN_ON(!ret);
-   ret = max_t(int, ret, bio_iovec(bio).bv_len  9);
-
-   return ret;
-}
-
-static void bch_bio_submit_split_done(struct closure *cl)
-{
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   s-bio-bi_end_io = s-bi_end_io;
-   s-bio-bi_private = s-bi_private;
-   bio_endio_nodec(s-bio, 0);
-
-   closure_debug_destroy(s-cl);
-   mempool_free(s, s-p-bio_split_hook);
-}
-
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
-{
-   struct closure *cl = bio-bi_private;
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   if (error)
-   clear_bit(BIO_UPTODATE, s-bio-bi_flags);
-
-   bio_put(bio);
-   closure_put(cl);
-}
-
-void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
-{
-   struct bio_split_hook *s;
-   struct bio *n;
-
-   if (!bio_has_data(bio)  !(bio-bi_rw  REQ_DISCARD))
-   goto submit;
-
-   if (bio_sectors(bio) = bch_bio_max_sectors(bio))
-   goto submit;
-
-   s = mempool_alloc(p-bio_split_hook, GFP_NOIO);
-   closure_init(s-cl, NULL);
-
-   s-bio  = bio;
-   s-p= p;
-   s-bi_end_io= bio-bi_end_io;
-   s-bi_private   = bio-bi_private;
-   bio_get(bio);
-
-   do {
-   n = bio_next_split(bio, bch_bio_max_sectors(bio),
-  GFP_NOIO, s-p-bio_split

[PATCH 7/9] blk-lib.c: generic_make_request() handles large bios now

2013-11-04 Thread Kent Overstreet
generic_make_request() will now do for us what the code in blk-lib.c was
doing manually with the bio_batch stuff - we still need some looping in
case we're trying to discard/zeroout more than around a gigabyte, but
when we can submit that much at a time, doing the submissions in parallel
really shouldn't matter.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 block/blk-lib.c | 175 ++--
 1 file changed, 30 insertions(+), 145 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2da76c9..368c36a 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,23 +9,6 @@
 
 #include blk.h
 
-struct bio_batch {
-   atomic_tdone;
-   unsigned long   flags;
-   struct completion   *wait;
-};
-
-static void bio_batch_end_io(struct bio *bio, int err)
-{
-   struct bio_batch *bb = bio-bi_private;
-
-   if (err  (err != -EOPNOTSUPP))
-   clear_bit(BIO_UPTODATE, bb-flags);
-   if (atomic_dec_and_test(bb-done))
-   complete(bb-wait);
-   bio_put(bio);
-}
-
 /**
  * blkdev_issue_discard - queue a discard
  * @bdev:  blockdev to issue discard for
@@ -40,15 +23,10 @@ static void bio_batch_end_io(struct bio *bio, int err)
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
 {
-   DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
-   unsigned int max_discard_sectors, granularity;
-   int alignment;
-   struct bio_batch bb;
struct bio *bio;
int ret = 0;
-   struct blk_plug plug;
 
if (!q)
return -ENXIO;
@@ -56,78 +34,28 @@ int blkdev_issue_discard(struct block_device *bdev, 
sector_t sector,
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
 
-   /* Zero-sector (unknown) and one-sector granularities are the same.  */
-   granularity = max(q-limits.discard_granularity  9, 1U);
-   alignment = (bdev_discard_alignment(bdev)  9) % granularity;
-
-   /*
-* Ensure that max_discard_sectors is of the proper
-* granularity, so that requests stay aligned after a split.
-*/
-   max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX  9);
-   max_discard_sectors -= max_discard_sectors % granularity;
-   if (unlikely(!max_discard_sectors)) {
-   /* Avoid infinite loop below. Being cautious never hurts. */
-   return -EOPNOTSUPP;
-   }
-
if (flags  BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
 
-   atomic_set(bb.done, 1);
-   bb.flags = 1  BIO_UPTODATE;
-   bb.wait = wait;
-
-   blk_start_plug(plug);
while (nr_sects) {
-   unsigned int req_sects;
-   sector_t end_sect, tmp;
-
bio = bio_alloc(gfp_mask, 1);
-   if (!bio) {
-   ret = -ENOMEM;
-   break;
-   }
-
-   req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
-
-   /*
-* If splitting a request, and the next starting sector would be
-* misaligned, stop the discard at the previous aligned sector.
-*/
-   end_sect = sector + req_sects;
-   tmp = end_sect;
-   if (req_sects  nr_sects 
-   sector_div(tmp, granularity) != alignment) {
-   end_sect = end_sect - alignment;
-   sector_div(end_sect, granularity);
-   end_sect = end_sect * granularity + alignment;
-   req_sects = end_sect - sector;
-   }
+   if (!bio)
+   return -ENOMEM;
 
-   bio-bi_iter.bi_sector = sector;
-   bio-bi_end_io = bio_batch_end_io;
bio-bi_bdev = bdev;
-   bio-bi_private = bb;
+   bio-bi_iter.bi_sector = sector;
+   bio-bi_iter.bi_size = min_t(sector_t, nr_sects, 1  20)  9;
 
-   bio-bi_iter.bi_size = req_sects  9;
-   nr_sects -= req_sects;
-   sector = end_sect;
+   sector += bio_sectors(bio);
+   nr_sects -= bio_sectors(bio);
 
-   atomic_inc(bb.done);
-   submit_bio(type, bio);
+   ret = submit_bio_wait(type, bio);
+   if (ret)
+   break;
}
-   blk_finish_plug(plug);
-
-   /* Wait for bios in-flight */
-   if (!atomic_dec_and_test(bb.done))
-   wait_for_completion_io(wait);
-
-   if (!test_bit(BIO_UPTODATE, bb.flags))
-   ret = -EIO;
 
return

[PATCH 6/9] mtip32xx: handle arbitrary size bios

2013-11-04 Thread Kent Overstreet
We get a measurable performance increase by handling this in the driver, where
we're already looping over the biovec, instead of handling it separately in
generic_make_request() (or, originally, bio_add_page()).

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/block/mtip32xx/mtip32xx.c | 46 +--
 1 file changed, 15 insertions(+), 31 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index d4c669b..c5a7a96 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2648,24 +2648,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, 
sector_t sector,
 }
 
 /*
- * Release a command slot.
- *
- * @dd  Pointer to the driver data structure.
- * @tag Slot tag
- *
- * return value
- *  None
- */
-static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
-   int unaligned)
-{
-   struct semaphore *sem = unaligned ? dd-port-cmd_slot_unal :
-   dd-port-cmd_slot;
-   release_slot(dd-port, tag);
-   up(sem);
-}
-
-/*
  * Obtain a command slot and return its associated scatter list.
  *
  * @dd  Pointer to the driver data structure.
@@ -4016,21 +3998,22 @@ static void mtip_make_request(struct request_queue 
*queue, struct bio *bio)
 
sg = mtip_hw_get_scatterlist(dd, tag, unaligned);
if (likely(sg != NULL)) {
-   if (unlikely((bio)-bi_vcnt  MTIP_MAX_SG)) {
-   dev_warn(dd-pdev-dev,
-   Maximum number of SGL entries exceeded\n);
-   bio_io_error(bio);
-   mtip_hw_release_scatterlist(dd, tag, unaligned);
-   return;
-   }
-
/* Create the scatter list for this bio. */
bio_for_each_segment(bvec, bio, iter) {
-   sg_set_page(sg[nents],
-   bvec.bv_page,
-   bvec.bv_len,
-   bvec.bv_offset);
-   nents++;
+   if (unlikely(nents == MTIP_MAX_SG)) {
+   struct bio *split = bio_clone(bio, GFP_NOIO);
+
+   split-bi_iter = iter;
+   bio-bi_iter.bi_size -= iter.bi_size;
+   bio_chain(split, bio);
+   generic_make_request(split);
+   break;
+   }
+
+   sg_set_page(sg[nents++],
+   bvec.bv_page,
+   bvec.bv_len,
+   bvec.bv_offset);
}
 
/* Issue the read/write. */
@@ -4145,6 +4128,7 @@ skip_create_disk:
blk_queue_max_hw_sectors(dd-queue, 0x);
blk_queue_max_segment_size(dd-queue, 0x40);
blk_queue_io_min(dd-queue, 4096);
+   set_bit(QUEUE_FLAG_LARGEBIOS,   dd-queue-queue_flags);
 
/*
 * write back cache is not supported in the device. FUA depends on
-- 
1.8.4.rc3



[PATCH 1/9] block: Convert various code to bio_for_each_segment()

2013-11-04 Thread Kent Overstreet
With immutable biovecs we don't want code accessing bi_io_vec directly -
the uses this patch changes weren't incorrect, since they all own the
bio, but they make the code harder to audit for no good reason. This
will also help with multipage bvecs later.
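
The conversion is mechanical; the general shape, for a bio the code owns
(do_something() is a stand-in for the per-page work):

    #include <linux/bio.h>

    /* before: walking bi_io_vec by hand */
    static void walk_old(struct bio *bio)
    {
        struct bio_vec *bvec = bio->bi_io_vec;
        int i = 0;

        while (i < bio->bi_vcnt) {
            do_something(bvec->bv_page);
            bvec++;
            i++;
        }
    }

    /* after: same loop, but greppable and auditable */
    static void walk_new(struct bio *bio)
    {
        struct bio_vec *bvec;
        int i;

        /* only legal because we own the bio - _all means all of it */
        bio_for_each_segment_all(bvec, bio, i)
            do_something(bvec->bv_page);
    }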

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Alexander Viro v...@zeniv.linux.org.uk
Cc: Chris Mason chris.ma...@fusionio.com
Cc: Jaegeuk Kim jaegeuk@samsung.com
Cc: Joern Engel jo...@logfs.org
Cc: Prasad Joshi prasadjoshi.li...@gmail.com
Cc: Trond Myklebust trond.mykleb...@netapp.com
---
 fs/btrfs/compression.c   | 10 --
 fs/btrfs/disk-io.c   | 11 ---
 fs/btrfs/extent_io.c | 37 ++---
 fs/btrfs/inode.c | 15 ++-
 fs/f2fs/data.c   | 13 +
 fs/f2fs/segment.c| 12 +---
 fs/logfs/dev_bdev.c  | 18 +++---
 fs/mpage.c   | 17 -
 fs/nfs/blocklayout/blocklayout.c | 34 +-
 9 files changed, 66 insertions(+), 101 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 06ab821..52e7848 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -203,18 +203,16 @@ csum_failed:
if (cb-errors) {
bio_io_error(cb-orig_bio);
} else {
-   int bio_index = 0;
-   struct bio_vec *bvec = cb-orig_bio-bi_io_vec;
+   int i;
+   struct bio_vec *bvec;
 
/*
 * we have verified the checksum already, set page
 * checked so the end_io handlers know about it
 */
-   while (bio_index  cb-orig_bio-bi_vcnt) {
+   bio_for_each_segment_all(bvec, cb-orig_bio, i)
SetPageChecked(bvec-bv_page);
-   bvec++;
-   bio_index++;
-   }
+
bio_endio(cb-orig_bio, 0);
}
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62176ad..733182e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -850,20 +850,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, 
struct inode *inode,
 
 static int btree_csum_one_bio(struct bio *bio)
 {
-   struct bio_vec *bvec = bio-bi_io_vec;
-   int bio_index = 0;
+   struct bio_vec *bvec;
struct btrfs_root *root;
-   int ret = 0;
+   int i, ret = 0;
 
-   WARN_ON(bio-bi_vcnt = 0);
-   while (bio_index  bio-bi_vcnt) {
+   bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec-bv_page-mapping-host)-root;
ret = csum_dirty_buffer(root, bvec-bv_page);
if (ret)
break;
-   bio_index++;
-   bvec++;
}
+
return ret;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0df176a..ea5a08b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2014,7 +2014,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 
start,
}
bio-bi_bdev = dev-bdev;
bio_add_page(bio, page, length, start - page_offset(page));
-   btrfsic_submit_bio(WRITE_SYNC, bio);
+   btrfsic_submit_bio(WRITE_SYNC, bio); /* XXX: submit_bio_wait() */
wait_for_completion(compl);
 
if (!test_bit(BIO_UPTODATE, bio-bi_flags)) {
@@ -2340,12 +2340,13 @@ int end_extent_writepage(struct page *page, int err, 
u64 start, u64 end)
  */
 static void end_bio_extent_writepage(struct bio *bio, int err)
 {
-   struct bio_vec *bvec = bio-bi_io_vec + bio-bi_vcnt - 1;
+   struct bio_vec *bvec;
struct extent_io_tree *tree;
u64 start;
u64 end;
+   int i;
 
-   do {
+   bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec-bv_page;
tree = BTRFS_I(page-mapping-host)-io_tree;
 
@@ -2363,14 +2364,11 @@ static void end_bio_extent_writepage(struct bio *bio, 
int err)
start = page_offset(page);
end = start + bvec-bv_offset + bvec-bv_len - 1;
 
-   if (--bvec = bio-bi_io_vec)
-   prefetchw(bvec-bv_page-flags);
-
if (end_extent_writepage(page, err, start, end))
continue;
 
end_page_writeback(page);
-   } while (bvec = bio-bi_io_vec);
+   }
 
bio_put(bio);
 }
@@ -2400,9 +2398,8 @@ endio_readpage_release_extent(struct extent_io_tree 
*tree, u64 start, u64 len,
  */
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
+   struct bio_vec *bvec;
int uptodate = test_bit(BIO_UPTODATE, bio-bi_flags);
-   struct bio_vec *bvec_end = bio-bi_io_vec + bio-bi_vcnt - 1;
-   struct bio_vec *bvec = bio-bi_io_vec;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct

[PATCH 3/9] block: Move bouncing to generic_make_request()

2013-11-04 Thread Kent Overstreet
Next patch is going to make generic_make_request() handle arbitrary
sized bios by splitting them if necessary. It makes more sense to call
blk_queue_bounce() first, partly so it's working on larger bios, but mainly
so that the code that splits bios - and __blk_recalc_rq_segments() - won't
have to take bouncing into account: by the time they run, it will already
have been done.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Jiri Kosina jkos...@suse.cz
Cc: Asai Thambi S P asamymuth...@micron.com
---
 block/blk-core.c  | 14 +++---
 block/blk-merge.c | 13 -
 drivers/block/mtip32xx/mtip32xx.c |  2 --
 drivers/block/pktcdvd.c   |  2 --
 4 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d9cab97..3c7467e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1466,13 +1466,6 @@ void blk_queue_bio(struct request_queue *q, struct bio 
*bio)
struct request *req;
unsigned int request_count = 0;
 
-   /*
-* low level driver can indicate that it wants pages above a
-* certain limit bounced to low memory (ie for highmem, or even
-* ISA dma in theory)
-*/
-   blk_queue_bounce(q, bio);
-
if (bio_integrity_enabled(bio)  bio_integrity_prep(bio)) {
bio_endio(bio, -EIO);
return;
@@ -1822,6 +1815,13 @@ void generic_make_request(struct bio *bio)
do {
struct request_queue *q = bdev_get_queue(bio-bi_bdev);
 
+   /*
+* low level driver can indicate that it wants pages above a
+* certain limit bounced to low memory (ie for highmem, or even
+* ISA dma in theory)
+*/
+   blk_queue_bounce(q, bio);
+
q-make_request_fn(q, bio);
 
bio = bio_list_pop(current-bio_list);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 953b8df..9680ec73 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
 struct bio *bio)
 {
struct bio_vec bv, bvprv = { NULL };
-   int cluster, high, highprv = 1;
+   int cluster, prev = 0;
unsigned int seg_size, nr_phys_segs;
struct bio *fbio, *bbio;
struct bvec_iter iter;
@@ -27,13 +27,7 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) {
-   /*
-* the trick here is making sure that a high page is
-* never considered part of another segment, since that
-* might change with the bounce page.
-*/
-   high = page_to_pfn(bv.bv_page)  queue_bounce_pfn(q);
-   if (!high  !highprv  cluster) {
+   if (prev  cluster) {
if (seg_size + bv.bv_len
 queue_max_segment_size(q))
goto new_segment;
@@ -44,6 +38,7 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
 
seg_size += bv.bv_len;
bvprv = bv;
+   prev = 1;
continue;
}
 new_segment:
@@ -53,8 +48,8 @@ new_segment:
 
nr_phys_segs++;
bvprv = bv;
+   prev = 1;
seg_size = bv.bv_len;
-   highprv = high;
}
bbio = bio;
}
diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 52b2f2a..d4c669b 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4016,8 +4016,6 @@ static void mtip_make_request(struct request_queue 
*queue, struct bio *bio)
 
sg = mtip_hw_get_scatterlist(dd, tag, unaligned);
if (likely(sg != NULL)) {
-   blk_queue_bounce(queue, bio);
-
if (unlikely((bio)-bi_vcnt  MTIP_MAX_SG)) {
dev_warn(dd-pdev-dev,
Maximum number of SGL entries exceeded\n);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 1bf1f22..7991cc8 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2486,8 +2486,6 @@ static void pkt_make_request(struct request_queue *q, 
struct bio *bio)
goto end_io;
}
 
-   blk_queue_bounce(q, bio);
-
do {
sector_t zone = get_zone(bio-bi_iter.bi_sector

[PATCH 4/9] block: Make generic_make_request handle arbitrary sized bios

2013-11-04 Thread Kent Overstreet
The way the block layer is currently written, it goes to great lengths
to avoid having to split bios; upper layer code (such as bio_add_page())
checks what the underlying device can handle and tries to always create
bios that don't need to be split.

But this approach becomes unwieldy and eventually breaks down with
stacked devices and devices with dynamic limits, and it adds a lot of
complexity. If the block layer could split bios as needed, we could
eliminate a lot of complexity elsewhere - particularly in stacked
drivers. Code that creates bios can then create whatever size bios are
convenient, and more importantly stacked drivers don't have to deal with
both their own bio size limitations and the limitations of the
(potentially multiple) devices underneath them.

In the future this will let us delete merge_bvec_fn and a bunch of other code.
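
To make the payoff concrete, here's a sketch of what submitting code can look
like once the block layer splits for us (and with bio_add_page() gutted later
in the series) - no per-device size or segment checks at all. submit_pages()
is made up, and it assumes nr_pages <= BIO_MAX_PAGES:

    #include <linux/bio.h>

    static void submit_pages(struct block_device *bdev, sector_t sector,
                             struct page **pages, unsigned nr_pages, int rw)
    {
        struct bio *bio = bio_alloc(GFP_NOIO, nr_pages);
        unsigned i;

        bio->bi_bdev = bdev;
        bio->bi_iter.bi_sector = sector;

        /* can't fail while the bvec array has room, after this series */
        for (i = 0; i < nr_pages; i++)
            bio_add_page(bio, pages[i], PAGE_SIZE, 0);

        /* generic_make_request() splits this as the device requires */
        submit_bio(rw, bio);
    }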

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Neil Brown ne...@suse.de
Cc: Alasdair Kergon a...@redhat.com
Cc: dm-de...@redhat.com
---
 block/blk-core.c   |  26 ++-
 block/blk-merge.c  | 120 +
 block/blk.h|   3 ++
 include/linux/blkdev.h |   4 ++
 4 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3c7467e..abc5d23 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -566,6 +566,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (q-id  0)
goto fail_c;
 
+   q-bio_split = bioset_create(4, 0);
+   if (!q-bio_split)
+   goto fail_id;
+
q-backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q-backing_dev_info.state = 0;
@@ -575,7 +579,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
err = bdi_init(q-backing_dev_info);
if (err)
-   goto fail_id;
+   goto fail_split;
 
setup_timer(q-backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
@@ -620,6 +624,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
 fail_bdi:
bdi_destroy(q-backing_dev_info);
+fail_split:
+   bioset_free(q-bio_split);
 fail_id:
ida_simple_remove(blk_queue_ida, q-id);
 fail_c:
@@ -1681,15 +1687,6 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
 
-   if (likely(bio_is_rw(bio) 
-  nr_sectors  queue_max_hw_sectors(q))) {
-   printk(KERN_ERR bio too big device %s (%u  %u)\n,
-  bdevname(bio-bi_bdev, b),
-  bio_sectors(bio),
-  queue_max_hw_sectors(q));
-   goto end_io;
-   }
-
part = bio-bi_bdev-bd_part;
if (should_fail_request(part, bio-bi_iter.bi_size) ||
should_fail_request(part_to_disk(part)-part0,
@@ -1814,6 +1811,7 @@ void generic_make_request(struct bio *bio)
current-bio_list = bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio-bi_bdev);
+   struct bio *split = NULL;
 
/*
 * low level driver can indicate that it wants pages above a
@@ -1822,6 +1820,14 @@ void generic_make_request(struct bio *bio)
 */
blk_queue_bounce(q, bio);
 
+   if (!blk_queue_largebios(q))
+   split = blk_bio_segment_split(q, bio, q-bio_split);
+   if (split) {
+   bio_chain(split, bio);
+   bio_list_add(current-bio_list, bio);
+   bio = split;
+   }
+
q-make_request_fn(q, bio);
 
bio = bio_list_pop(current-bio_list);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9680ec73..6e07213 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,126 @@
 
 #include blk.h
 
+static struct bio *blk_bio_discard_split(struct request_queue *q,
+struct bio *bio,
+struct bio_set *bs)
+{
+   unsigned int max_discard_sectors, granularity;
+   int alignment;
+   sector_t tmp;
+   unsigned split_sectors;
+
+   /* Zero-sector (unknown) and one-sector granularities are the same.  */
+   granularity = max(q-limits.discard_granularity  9, 1U);
+
+   max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX  9);
+   max_discard_sectors -= max_discard_sectors % granularity;
+
+   if (unlikely(!max_discard_sectors)) {
+   /* XXX: warn */
+   return NULL;
+   }
+
+   if (bio_sectors(bio) = max_discard_sectors)
+   return NULL;
+
+   split_sectors = max_discard_sectors;
+
+   /*
+* If the next starting sector would be misaligned, stop

[PATCH 5/9] block: Gut bio_add_page(), kill bio_add_pc_page()

2013-11-04 Thread Kent Overstreet
Since generic_make_request() can now handle arbitrary size bios, all we
have to do is make sure the bvec array doesn't overflow.
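
Since the hunk below is truncated in the archive, here's roughly what the
gutted function reduces to - a reconstruction from the surrounding hunks, not
the verbatim patch:

    /* Sketch: no queue limits, just bounds-check the bvec array and
     * merge contiguous pages. */
    int bio_add_page(struct bio *bio, struct page *page,
                     unsigned int len, unsigned int offset)
    {
        struct bio_vec *bv;

        /* cloned bio must not modify vec list */
        if (unlikely(bio_flagged(bio, BIO_CLONED)))
            return 0;

        /* contiguous with the previous segment? just extend it */
        if (bio->bi_vcnt > 0) {
            bv = &bio->bi_io_vec[bio->bi_vcnt - 1];

            if (page == bv->bv_page &&
                offset == bv->bv_offset + bv->bv_len) {
                bv->bv_len += len;
                goto done;
            }
        }

        if (bio->bi_vcnt >= bio->bi_max_vecs)
            return 0;       /* bvec array is full */

        bv = &bio->bi_io_vec[bio->bi_vcnt++];
        bv->bv_page = page;
        bv->bv_len = len;
        bv->bv_offset = offset;
    done:
        bio->bi_iter.bi_size += len;
        return len;
    }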

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/scsi/osd/osd_initiator.c   |   5 +-
 drivers/target/target_core_pscsi.c |   5 +-
 fs/bio.c   | 158 +++--
 fs/exofs/ore.c |   8 +-
 fs/exofs/ore_raid.c|   8 +-
 include/linux/bio.h|   2 -
 6 files changed, 37 insertions(+), 149 deletions(-)

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index bac04c2..e52b30d 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1043,7 +1043,6 @@ EXPORT_SYMBOL(osd_req_read_sg);
 static struct bio *_create_sg_bios(struct osd_request *or,
void **buff, const struct osd_sg_entry *sglist, unsigned numentries)
 {
-   struct request_queue *q = osd_request_queue(or-osd_dev);
struct bio *bio;
unsigned i;
 
@@ -1060,9 +1059,9 @@ static struct bio *_create_sg_bios(struct osd_request *or,
unsigned added_len;
 
BUG_ON(offset + len  PAGE_SIZE);
-   added_len = bio_add_pc_page(q, bio, page, len, offset);
+   added_len = bio_add_page(bio, page, len, offset);
if (unlikely(len != added_len)) {
-   OSD_DEBUG(bio_add_pc_page len(%d) != added_len(%d)\n,
+   OSD_DEBUG(bio_add_page len(%d) != added_len(%d)\n,
  len, added_len);
bio_put(bio);
return ERR_PTR(-ENOMEM);
diff --git a/drivers/target/target_core_pscsi.c 
b/drivers/target/target_core_pscsi.c
index 551c96c..d65d512 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -922,12 +922,11 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, 
u32 sgl_nents,
tbio = tbio-bi_next = bio;
}
 
-   pr_debug(PSCSI: Calling bio_add_pc_page() i: %d
+   pr_debug(PSCSI: Calling bio_add_page() i: %d
 bio: %p page: %p len: %d off: %d\n, i, bio,
page, len, off);
 
-   rc = bio_add_pc_page(pdv-pdv_sd-request_queue,
-   bio, page, bytes, off);
+   rc = bio_add_page(bio, page, bytes, off);
if (rc != bytes)
goto fail;
 
diff --git a/fs/bio.c b/fs/bio.c
index 7d538a1..c60bfcb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -665,12 +665,22 @@ int bio_get_nr_vecs(struct block_device *bdev)
 }
 EXPORT_SYMBOL(bio_get_nr_vecs);
 
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned short max_sectors)
+/**
+ * bio_add_page-   attempt to add page to bio
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ */
+int bio_add_page(struct bio *bio, struct page *page,
+unsigned int len, unsigned int offset)
 {
-   int retried_segments = 0;
-   struct bio_vec *bvec;
+   struct bio_vec *bv;
 
/*
 * cloned bio must not modify vec list
@@ -678,41 +688,17 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
 
-   if (((bio-bi_iter.bi_size + len)  9)  max_sectors)
-   return 0;
-
/*
 * For filesystems with a blocksize smaller than the pagesize
 * we will often be called with the same page as last time and
 * a consecutive offset.  Optimize this special case.
 */
if (bio-bi_vcnt  0) {
-   struct bio_vec *prev = bio-bi_io_vec[bio-bi_vcnt - 1];
-
-   if (page == prev-bv_page 
-   offset == prev-bv_offset + prev-bv_len) {
-   unsigned int prev_bv_len = prev-bv_len;
-   prev-bv_len += len;
-
-   if (q-merge_bvec_fn) {
-   struct bvec_merge_data bvm = {
-   /* prev_bvec is already charged in
-  bi_size, discharge it in order to
-  simulate merging updated prev_bvec

Re: [dm-devel] [PATCH 4/9] block: Make generic_make_request handle arbitrary sized bios

2013-11-04 Thread Kent Overstreet
On Mon, Nov 04, 2013 at 03:56:52PM -0800, Mike Christie wrote:
 On 11/04/2013 03:36 PM, Kent Overstreet wrote:
  @@ -1822,6 +1820,14 @@ void generic_make_request(struct bio *bio)
   */
  blk_queue_bounce(q, bio);
   
  +   if (!blk_queue_largebios(q))
  +   split = blk_bio_segment_split(q, bio, q-bio_split);
 
 
 Is it assumed bios coming down this path are created using bio_add_page?
 If not, does blk_bio_segment_split need a queue_max_sectors or
 queue_max_hw_sectors check? I only saw a segment count check below.

Shoot, you're absolutely right - thanks, I'll have this fixed in the next
version.


[PATCH] block: Revert bio_clone() default behaviour

2013-11-05 Thread Kent Overstreet
This patch reverts the default behaviour introduced by
9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger
shares the source bio's biovec, cloning the biovec is once again the
default.

Instead, we add a new bio_clone_biovec_fast(), which creates a clone
that shares the source's biovec. This patch changes bcache and md to use
__bio_clone_biovec_fast() since they're expecting the new behaviour due
to other refactoring; most of the other uses of bio_clone() should be
same to convert to the _fast() variant but that will be done more
incrementally in other patches (bio_split() in particular).

Note that __bio_clone() isn't being readded - the reason being that with
immutable biovecs allocating the right number of biovecs for the new
clone is no longer trivial so we don't want drivers trying to do that
themselves.

This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 -
__bio_clone_fast() should not be setting bi_vcnt for bios that do not
own the biovec (see Documentation/block/biovecs.txt for rationale) - in
short, not setting it might cause bugs in the short term but long term
it's likely to hide nastier more subtle bugs, we don't want code looking
at bi_vcnt at all for bios it does not own. However, this patch
_shouldn't_ cause any regressions because of this since we're reverting
back to the old bio_clone() behaviour.
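
In other words, usage after this patch looks roughly like the sketch below
(clone_examples() and bs are made up; bio_clone_bioset_fast() is the name as
it appears in the diff):

    #include <linux/bio.h>

    static void clone_examples(struct bio *src, struct bio_set *bs)
    {
        /* default: clone gets its own copy of src's biovec, so there's
         * no lifetime coupling beyond the pages themselves */
        struct bio *clone = bio_clone_bioset(src, GFP_NOIO, bs);

        /* _fast: clone shares src's biovec - cheaper, but src must now
         * outlive the clone, and the clone must never modify the biovec
         * or use bio_for_each_segment_all() on it */
        struct bio *fast = bio_clone_bioset_fast(src, GFP_NOIO, bs);

        bio_put(fast);
        bio_put(clone);
    }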

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Chris Mason chris.ma...@fusionio.com
Cc: Mike Snitzer snit...@redhat.com
Cc: NeilBrown ne...@suse.de
Cc: Olof Johansson o...@lixom.net
---
Chris, Olaf, can you two in particular test this? I have tested the bounce
buffer code (and bcache), but Jens told me today there was an md bug that I
_still_ can't find any emails about so I'm not sure what to test for that.

 drivers/md/bcache/request.c |   6 +--
 drivers/md/dm.c |   4 +-
 fs/bio.c| 104 +++-
 include/linux/bio.h |   6 +--
 mm/bounce.c |   1 -
 5 files changed, 72 insertions(+), 49 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 52a1fef..ef44198 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -681,7 +681,7 @@ static void do_bio_hook(struct search *s)
struct bio *bio = s-bio.bio;
 
bio_init(bio);
-   __bio_clone(bio, s-orig_bio);
+   __bio_clone_fast(bio, s-orig_bio);
bio-bi_end_io  = request_endio;
bio-bi_private = s-cl;
 
@@ -969,8 +969,8 @@ static void request_write(struct cached_dev *dc, struct 
search *s)
trace_bcache_write(s-orig_bio, s-writeback, s-op.skip);
 
if (!s-writeback) {
-   s-op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
-  dc-disk.bio_split);
+   s-op.cache_bio = bio_clone_bioset_fast(bio, GFP_NOIO,
+   dc-disk.bio_split);
 
closure_bio_submit(bio, cl, s-d);
} else {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8e6174c..bafe7ed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1135,7 +1135,7 @@ static void clone_bio(struct dm_target_io *tio, struct 
bio *bio,
 {
struct bio *clone = tio-clone;
 
-   __bio_clone(clone, bio);
+   __bio_clone_fast(clone, bio);
 
if (bio_integrity(bio))
bio_integrity_clone(clone, bio, GFP_NOIO);
@@ -1177,7 +1177,7 @@ static void __clone_and_map_simple_bio(struct clone_info 
*ci,
 * ci-bio-bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
 * and discard, so no need for concern about wasted bvec allocations.
 */
-__bio_clone(clone, ci-bio);
+__bio_clone_fast(clone, ci-bio);
if (len)
bio_setup_sector(clone, ci-sector, len);
 
diff --git a/fs/bio.c b/fs/bio.c
index 6046c91..99ff176 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -539,15 +539,17 @@ inline int bio_phys_segments(struct request_queue *q, 
struct bio *bio)
 EXPORT_SYMBOL(bio_phys_segments);
 
 /**
- * __bio_clone -   clone a bio
+ * __bio_clone_fast - clone a bio that shares the original bio's biovec
  * @bio: destination bio
  * @bio_src: bio to clone
  *
  * Clone a bio. Caller will own the returned bio, but not
  * the actual data it points to. Reference count of returned
  * bio will be one.
+ *
+ * Caller must ensure that @bio_src is not freed before @bio.
  */
-void __bio_clone(struct bio *bio, struct bio *bio_src)
+void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 {
BUG_ON(bio-bi_pool  BIO_POOL_IDX(bio) != BIO_POOL_NONE);
 
@@ -560,20 +562,19 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
bio-bi_rw = bio_src-bi_rw;
bio-bi_iter = bio_src-bi_iter;
bio-bi_io_vec = bio_src-bi_io_vec;
-   bio-bi_vcnt = bio_src-bi_vcnt

Re: [PATCH] block: Revert bio_clone() default behaviour

2013-11-05 Thread Kent Overstreet
On Tue, Nov 05, 2013 at 09:02:19PM -0800, Olof Johansson wrote:
 Hi,
 
 On Tue, Nov 5, 2013 at 7:48 PM, Kent Overstreet k...@daterainc.com wrote:
 
  Chris, Olaf, can you two in particular test this? I have tested the bounce
  buffer code (and bcache), but Jens told me today there was an md bug that I
  _still_ can't find any emails about so I'm not sure what to test for that.
 
 (I'm guessing you mean me and not some German person here. :-)
 
 What is this expected to apply on top of? It doesn't apply cleanly on
 the 20131105 -next tree which is where I had been seeing the issues.

Sorry - it's on top of Jens' for-3.13/core branch,
git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git


Re: [PATCH] block: Revert bio_clone() default behaviour

2013-11-06 Thread Kent Overstreet
On Wed, Nov 06, 2013 at 11:11:30AM -0500, Chris Mason wrote:
 Quoting Kent Overstreet (2013-11-05 22:48:41)
  This patch reverts the default behaviour introduced by
  9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger
  shares the source bio's biovec, cloning the biovec is once again the
  default.
  
  Instead, we add a new bio_clone_biovec_fast(), which creates a clone
  that shares the source's biovec. This patch changes bcache and md to use
^
  dm?
 
  __bio_clone_biovec_fast() since they're expecting the new behaviour due
  to other refactoring; most of the other uses of bio_clone() should be
  same to convert to the _fast() variant but that will be done more
  incrementally in other patches (bio_split() in particular).
 
 Hi Kent,
 
 I noticed yesterday the _fast variants of bio clone introduce sharing
 between the src and the clone, but without any reference counts:
 
 bio-bi_io_vec = bio_src-bi_io_vec;
 
 Have you audited all of the _fast users to make sure they are not
 freeing the src before the clone?  Sorry if this came up already in past
 reviews.

Yup - that should actually be safe for all the existing bio_clone() users; I
audited all of them - because normally you're not going to complete the
original bio until the clone finishes.

  Note that __bio_clone() isn't being readded - the reason being that with
  immutable biovecs allocating the right number of biovecs for the new
  clone is no longer trivial so we don't want drivers trying to do that
  themselves.
  
  This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 -
  __bio_clone_fast() should not be setting bi_vcnt for bios that do not
  own the biovec (see Documentation/block/biovecs.txt for rationale) - in
  short,
 
 I think I see what you mean with tying bi_vcnt to ownership of the bio,
 but we're not consistent.  Looking at bio_for_eaach_segment_all:
 
 *
  * drivers should _never_ use the all version - the bio may have been split
  * before it got to the driver and the driver won't own all of it
  */
 #define bio_for_each_segment_all(bvl, bio, i)   \
 for (i = 0, bvl = (bio)-bi_io_vec; i  (bio)-bi_vcnt; i++, bvl++)
 
 bio_for_each_segment_all still trusts bi_vcnt, so any
 bio_for_each_segment_all operation on a clone will basically be a noop.
 
 Just looking at MD raid1 make_request()
 
 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
   ...
   alloc_behind_pages(mbio, r1_bio); - bio_for_each_segment_all
   ...
   if (r1_bio-behind_bvecs) {
 bio_for_each_segment_all(bvec, mbio, j)
   ...
 
 I didn't test MD without the vcnt fix, but I think any operations in MD
 that duplicate data for raid1 turn into noops.  I think we'll end up
 writing garbage (or nothing) to the second mirror.
 
 If you look at dm's crypt_free_buffer_pages(), it had similar problems.

Those are fine actually - in both cases, they're bios they allocated, not the
bios that were submitted to them. Though md _definitely_ shouldn't have been
sharing the original bio's biovec, so looks like this patch will fix a bug
there...

(I remember seeing that code before and I thought I added a bio_clone_biovec()
call to that md code, but apparently that never got commited. Argh.)

 
  not setting it might cause bugs in the short term but long term
  it's likely to hide nastier more subtle bugs, we don't want code looking
  at bi_vcnt at all for bios it does not own.
 
 I think the concept of bio ownership is still much too weak, at least
 for established users like MD and DM.  I don't know how to verify the
 sharing of bi_io_vec without some kind of reference counting on the
 iovec.

What's unclear about it? The rule is just: if you didn't allocate the biovec,
don't modify it and don't use bio_for_each_segment_all() on it (probably I
didn't state it clearly enough before, though).
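
Stated as code, something like this sketch (ownership_rule() is made up;
SetPageChecked() stands in for any per-segment mutation):

    #include <linux/bio.h>
    #include <linux/page-flags.h>

    static void ownership_rule(struct bio *my_bio, struct bio *their_bio)
    {
        struct bio_vec *bvec;
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned bytes = 0;
        int i;

        /* OK: we allocated my_bio and its biovec */
        bio_for_each_segment_all(bvec, my_bio, i)
            SetPageChecked(bvec->bv_page);

        /* NOT OK to use _all here: their_bio came from above - it may be
         * a split or a _fast clone, so bi_vcnt can't be trusted and the
         * biovec may be shared. Read-only iteration over bi_iter is fine: */
        bio_for_each_segment(bv, their_bio, iter)
            bytes += bv.bv_len;

        pr_debug("%u bytes outstanding\n", bytes);
    }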


Re: block: Revert bio_clone() default behaviour

2013-11-06 Thread Kent Overstreet
On Wed, Nov 06, 2013 at 03:31:02PM -0500, Mike Snitzer wrote:
 Hey Kent,
 
 Digging a bit in the LKML archive I think this patch is in response to
 this thread: https://lkml.org/lkml/2013/11/6/27

That thread I saw, Jens told me there was another one though

 Might be good to give context for which reported problem(s) are being
 fixed by this patch.
 
 On Tue, Nov 05 2013 at 10:48pm -0500,
 Kent Overstreet k...@daterainc.com wrote:
 
  This patch reverts the default behaviour introduced by
  9fc6286f347d00528adcdcf12396d220f47492ed - bio_clone_biovec() no clonger
  shares the source bio's biovec, cloning the biovec is once again the
  default.
 
 Your focus, in terms of revert, seems to be on restoring
 bio_clone_bioset, so: s/bio_clone_biovec/bio_clone_bioset/
 
 Maybe best to say effectively reverts since you aren't reverting 
 9fc6286f347d00528adcdcf12396d220f47492ed ?  Also s/clonger/longer/ typo ^
 
  Instead, we add a new bio_clone_biovec_fast(), which creates a clone
  that shares the source's biovec. This patch changes bcache and md to use
 
 s/md/dm/

Whoops :p

  __bio_clone_biovec_fast() since they're expecting the new behaviour due
  to other refactoring; most of the other uses of bio_clone() should be
  same to convert to the _fast() variant but that will be done more
 
 s/same/safe/

Thanks

 
  incrementally in other patches (bio_split() in particular).
  
  Note that __bio_clone() isn't being readded - the reason being that with
  immutable biovecs allocating the right number of biovecs for the new
  clone is no longer trivial so we don't want drivers trying to do that
  themselves.
  
  This patch also reverts febca1baea1cfe2d7a0271385d89b03d5fb34f94 -
  __bio_clone_fast() should not be setting bi_vcnt for bios that do not
  own the biovec (see Documentation/block/biovecs.txt for rationale) - in
  short, not setting it might cause bugs in the short term but long term
  it's likely to hide nastier more subtle bugs, we don't want code looking
  at bi_vcnt at all for bios it does not own. However, this patch
  _shouldn't_ cause any regressions because of this since we're reverting
  back to the old bio_clone() behaviour.
  
  Signed-off-by: Kent Overstreet k...@daterainc.com
  Cc: Jens Axboe ax...@kernel.dk
  Cc: Chris Mason chris.ma...@fusionio.com
  Cc: Mike Snitzer snit...@redhat.com
  Cc: NeilBrown ne...@suse.de
  Cc: Olof Johansson o...@lixom.net
  ---
  Chris, Olaf, can you two in particular test this? I have tested the bounce
  buffer code (and bcache), but Jens told me today there was an md bug that I
  _still_ can't find any emails about so I'm not sure what to test for that.
 
 /me assumes you really mean md here, given Chris's later reply in this thread.
 
 Relative to DM, this patch looks fine to me:
 
 Acked-by: Mike Snitzer snit...@redhat.com

Thanks!


Re: [PATCH] block: Revert bio_clone() default behaviour

2013-11-06 Thread Kent Overstreet
On Wed, Nov 06, 2013 at 03:22:36PM -0500, Chris Mason wrote:
 Quoting Kent Overstreet (2013-11-06 15:02:22)
  Yup - that should actually be safe for all the existing bio_clone() users
  actually, I audited all of them - because normally you're not going to 
  complete
  the original bio until the clone finishes.
 
 I'd say we need an ack from Neil before we can switch to _fast.  The
 completions look non-trivial and _fast adds new ordering requirements on
 free.

Agreed, I've already broken md enough lately :P

(I've started working on a real test suite for the block layer - that'll help
with this stuff...)

   If you look at dm's crypt_free_buffer_pages(), it had similar problems.
  
  Those are fine actually - in both cases, they're bios they allocated, not 
  the
  bios that were submitted to them.
 
 Both are expecting bi_vcnt to be non-zero after a clone, which wasn't
 the case before the bi_vcnt patch.  With your current patch (adding
 _fast) both should now be correct.

Yeah, but bi_vcnt being zero was just a symptom - the underlying problem was
that, with the clone sharing the parent's biovec, md was modifying a biovec
(including bv_page!) that it didn't own - _that_ was what was horribly broken
about it.

  What's unclear about it? The rule is just - if you didn't allocate the 
  biovec,
  don't modify it or use bio_for_each_segment_all() (probably I didn't quite 
  state
  it clearly enough before though)
 
 That part makes sense.  The new rule that scares me is that we can't
 free the src of the clone until all the clones are freed.  If it works
 with today's existing users it feels like it is more by accident than
 design.  I'm not saying we can't do it, we just need some bigger
 flashing warning lights.

Yeah, Jens was saying the same thing yesterday, hence this patch.

OTOH - with regard to just the ordering requirements, the more I look at
various code the less accidental it seems that this works: the best
explanation I've come up with so far is that you already needed to ensure that
the _pages_ the clone points to stick around until the clone completes, and if
you don't own the original bio the only way to do that is to not complete the
original bio until after the clone completes.

So if you're a driver cloning bios that were submitted to you, bio_clone_fast()
introduces no new ordering requirements.

On the third hand - if you're cloning (i.e. splitting) your own bios, it would
be possible to screw up the ordering. I don't know of any code in the kernel
that does this today (except for, sort of, bcache), though my dio rewrite
takes this approach. But if you do the obvious and sane thing with bio_chain()
it's a non issue; you'd have to come up with something pretty contrived and
dumb for this to actually be a problem in practice.
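
"The obvious and sane thing" being roughly this shape - a sketch, where
my_max_sectors() is made up and I'm assuming a bio_split() that returns the
front piece and leaves the remainder in the original bio:

    #include <linux/bio.h>

    static void submit_in_pieces(struct bio *bio, struct bio_set *bs)
    {
        while (bio_sectors(bio) > my_max_sectors(bio)) {
            struct bio *split = bio_split(bio, my_max_sectors(bio),
                                          GFP_NOIO, bs);

            /* parent's remaining count is bumped - it can't complete
             * before this split does, so the ordering can't go wrong */
            bio_chain(split, bio);
            generic_make_request(split);
        }

        generic_make_request(bio);
    }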

Anyways, I haven't come to any definite conclusions, those are just my
observations so far.


[PATCH] block/s390: Fixes for Convert bio_for_each_segment() to bvec_iter

2013-11-06 Thread Kent Overstreet
5d1f127c3e0c57d64ce75ee04a0db2b40a3e21df was incomplete - this just converts
s390's drivers.

Signed-off-by: Kent Overstreet k...@daterainc.com
---
I was poking around a bit, and it looks like there's nothing really preventing
these drivers from building on non s390 except that a bunch of headers are in
arch/s390/include. Would be nice to get that fixed so we can get better build
coverage.

 drivers/s390/block/dasd_diag.c   | 10 
 drivers/s390/block/dasd_eckd.c   | 48 ++--
 drivers/s390/block/dasd_fba.c| 26 +--
 drivers/s390/block/scm_blk.c |  8 +++---
 drivers/s390/block/scm_blk_cluster.c |  4 +--
 5 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index 92bd22c..9cbc567 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -504,7 +504,7 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct 
dasd_device *memdev,
struct dasd_diag_req *dreq;
struct dasd_diag_bio *dbio;
struct req_iterator iter;
-   struct bio_vec *bv;
+   struct bio_vec bv;
char *dst;
unsigned int count, datasize;
sector_t recid, first_rec, last_rec;
@@ -525,10 +525,10 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct 
dasd_device *memdev,
/* Check struct bio and count the number of blocks for the request. */
count = 0;
rq_for_each_segment(bv, req, iter) {
-   if (bv-bv_len  (blksize - 1))
+   if (bv.bv_len  (blksize - 1))
/* Fba can only do full blocks. */
return ERR_PTR(-EINVAL);
-   count += bv-bv_len  (block-s2b_shift + 9);
+   count += bv.bv_len  (block-s2b_shift + 9);
}
/* Paranoia. */
if (count != last_rec - first_rec + 1)
@@ -545,8 +545,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct 
dasd_device *memdev,
dbio = dreq-bio;
recid = first_rec;
rq_for_each_segment(bv, req, iter) {
-   dst = page_address(bv-bv_page) + bv-bv_offset;
-   for (off = 0; off  bv-bv_len; off += blksize) {
+   dst = page_address(bv.bv_page) + bv.bv_offset;
+   for (off = 0; off  bv.bv_len; off += blksize) {
memset(dbio, 0, sizeof (struct dasd_diag_bio));
dbio-type = rw_cmd;
dbio-block_number = recid + 1;
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 5adb204..030d6e0 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -2507,7 +2507,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single(
struct dasd_ccw_req *cqr;
struct ccw1 *ccw;
struct req_iterator iter;
-   struct bio_vec *bv;
+   struct bio_vec bv;
char *dst;
unsigned int off;
int count, cidaw, cplength, datasize;
@@ -2529,13 +2529,13 @@ static struct dasd_ccw_req 
*dasd_eckd_build_cp_cmd_single(
count = 0;
cidaw = 0;
rq_for_each_segment(bv, req, iter) {
-   if (bv-bv_len  (blksize - 1))
+   if (bv.bv_len  (blksize - 1))
/* Eckd can only do full blocks. */
return ERR_PTR(-EINVAL);
-   count += bv-bv_len  (block-s2b_shift + 9);
+   count += bv.bv_len  (block-s2b_shift + 9);
 #if defined(CONFIG_64BIT)
-   if (idal_is_needed (page_address(bv-bv_page), bv-bv_len))
-   cidaw += bv-bv_len  (block-s2b_shift + 9);
+   if (idal_is_needed (page_address(bv.bv_page), bv.bv_len))
+   cidaw += bv.bv_len  (block-s2b_shift + 9);
 #endif
}
/* Paranoia. */
@@ -2606,16 +2606,16 @@ static struct dasd_ccw_req 
*dasd_eckd_build_cp_cmd_single(
  last_rec - recid + 1, cmd, basedev, blksize);
}
rq_for_each_segment(bv, req, iter) {
-   dst = page_address(bv-bv_page) + bv-bv_offset;
+   dst = page_address(bv.bv_page) + bv.bv_offset;
if (dasd_page_cache) {
char *copy = kmem_cache_alloc(dasd_page_cache,
  GFP_DMA | __GFP_NOWARN);
if (copy  rq_data_dir(req) == WRITE)
-   memcpy(copy + bv-bv_offset, dst, bv-bv_len);
+   memcpy(copy + bv.bv_offset, dst, bv.bv_len);
if (copy)
-   dst = copy + bv-bv_offset;
+   dst = copy + bv.bv_offset;
}
-   for (off = 0; off  bv-bv_len; off += blksize) {
+   for (off = 0; off  bv.bv_len; off += blksize) {
sector_t trkid = recid

Re: [PATCH] block: Revert bio_clone() default behaviour

2013-11-06 Thread Kent Overstreet
On Wed, Nov 06, 2013 at 04:25:45PM -0500, Chris Mason wrote:
 Quoting Kent Overstreet (2013-11-06 15:57:34)
  On Wed, Nov 06, 2013 at 03:22:36PM -0500, Chris Mason wrote:
   Quoting Kent Overstreet (2013-11-06 15:02:22)
 
 [ ... nods, thanks! ... ]
 
  OTOH - with regards to just the ordering requirements, the more I look at
  various code the less accidental the fact that that works seems to be: the 
  best
  explanation I've come up with so far is that you already needed to ensure 
  that
  the _pages_ the clone points to stick around until the clone completes, and 
  if
  you don't own the original bio the only way to do that is to not complete 
  the
  original bio until after the clone completes.
  
  So if you're a driver cloning bios that were submitted to you, 
  bio_clone_fast()
  introduces no new ordering requirements.
  
  On the third hand - if you're cloning (i.e. splitting) your own bios, in 
  that
  case it would be possible to screw up the ordering - I don't know of any 
  code in
  the kernel that does this today (except for, sort of, bcache) but my dio 
  rewrite
  takes this approach - but if you do the obvious and sane thing with 
  bio_chain()
  it's a non issue, it seems to me you'd have to come up with something pretty
  contrived and dumb for this to actually be an issue in practice.
  
  Anyways, I haven't come to any definite conclusions, those are just my
  observations so far.
 
 I do think you're right.  We all seem to have clones doing work on
 behalf of the original, and when everyone is done we complete the
 original.
 
 But, btrfs does have silly things like this:
 
 dio_end_io(dio_bio, err); // end and free the original
   bio_put(bio); // free the clone
 
 It's not a bug yet, but given enough time the space between those two
 frees will grow new code that kills us all.

Hopefully the DIO situation will get better in the near future, Josef was
telling me btrfs was probably going to end up with its own DIO code (replacing a
good chunk of direct-io.c) and that should get a lot easier and saner for you
guys with my dio rewrite. Need to finish getting that to pass xfstests...

 Really though, the new stuff is better, thanks.

Thanks! :)


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-07 Thread Kent Overstreet
On Thu, Nov 07, 2013 at 11:17:22AM -0800, Olof Johansson wrote:
 On Sat, Nov 2, 2013 at 1:50 PM, Dave Kleikamp dave.kleik...@oracle.com 
 wrote:
  On 11/01/2013 03:53 PM, Jens Axboe wrote:
  On 11/01/2013 02:41 PM, Dave Kleikamp wrote:
  On 11/01/2013 03:27 PM, Jens Axboe wrote:
  On 11/01/2013 02:22 PM, Stephen Rothwell wrote:
  Hi Jens,
 
  On Fri, 01 Nov 2013 09:10:43 -0600 Jens Axboe ax...@kernel.dk wrote:
 
  On 10/31/2013 09:20 PM, Stephen Rothwell wrote:
 
  Today's linux-next merge of the block tree got a conflict in
  drivers/block/loop.c between commit 2486740b52fd (loop: use aio to
  perform io on the underlying file) from the aio-direct tree and 
  commit
  ed2d2f9a8265 (block: Abstract out bvec iterator) from the block 
  tree.
 
  I fixed it up (I think - see below - I have also attached the final
  resulting file) and can carry the fix as necessary (no action is
  required).
 
 
  What tree is this from? It'd be a lot more convenient to fold that loop
  patch into my tree, especially since the block tree in linux-next 
  failed
  after this merge.
 
  I can only agree with you.  It is from the aio-direct tree (probably
  misnamed by me) (git://github.com/kleikamp/linux-shaggy.git#for-next) 
  run
  by Dave Kleikamp.
 
  Dave, input requested.
 
  In any case, I would suggest dropping the aio-direct tree instead of the
  entire block tree for coverage purposes, if merge or build failures
  happen because of it.
 
  I've had these patches in linux-next since August, and I'd really like
  to push them in the 3.13 merge window.
 
  Are there other problems besides this merge issue? I'll take a closer
  look at Stephen's merge patch and see if I find any other issues, but I
  really don't want to pull these patches out of linux-next now.
 
  I'm not saying that the patches should be dropped or not go into 3.13.
  What I'm saying is that if the choice is between having the bio and
  blk-mq stuff in linux-next or an addon to the loop driver, the decision
  should be quite clear.
 
  So we've three immediate options:
 
  1) You base it on top of the block tree
  2) I carry the loop updates
  3) You hand Stephen a merge patch for the resulting merge of the two
 
  Attached is a merge patch and the merged loop.c. I'm having problems
  with the loop driver with both the block and my tree. I'll continue to
  look at that, but everything should build cleanly with this.
 
 Hijacking(?) this thread since it seems relevant:
 
 I noticed the following panic on a chromebox with last night's next.
 20131106 shows it as well. I didn't go back further to see. 3.12 runs
 fine.
 
 I bisected it down, and unfortunately it points at Stephen's merge commit:
 
 commit 3caa8f38e7eeb56c7d48b0d5c323ffbf4939635d
 Merge: 447b374 bb6f7be
 Author: Stephen Rothwell s...@canb.auug.org.au
 Date:   Thu Nov 7 14:07:20 2013 +1100
 
 Merge remote-tracking branch 'aio-direct/for-next'
 
 Conflicts:
 drivers/block/loop.c
 fs/nfs/direct.c
 fs/nfs/file.c
 include/linux/blk_types.h
 
 
 ... but the branch alone runs fine.
 
 Context to the failure: Userspace is already up and running. ChromeOS
 will do ecryptfs and loopback mounts, etc, which is likely where this
 is hitting given the process running. It definitely happens during
 early userspace setup.

That looks like the bi_remaining BUG_ON() in bio_endio(), probably
related to the loopback driver. I'll start looking at the code soon as I
get into the office, this one should be easy to track down.

 
 Seems like we might be in for a bumpy ride in 3.13 w.r.t. block if the
 breakage we've found this week in -next is any indication.
 
 This seems to be reliably reproducing for me so I can help collect
 data if needed, Dave/Jens.
 
 [3.373979] EXT4-fs (sda1): mounted filesystem with ordered data
 mode. Opts: commit=600
 [3.385719] EXT4-fs (sda8): mounted filesystem with ordered data
 mode. Opts: commit=600
 [3.475540] bio: create slab bio-1 at 1
 [3.483577] EXT4-fs (dm-0): mounted filesystem with ordered data
 mode. Opts: discard,commit=600
 [3.556890] EXT4-fs (sda1): re-mounted. Opts: commit=600,data=ordered
 [3.636658] [ cut here ]
 [3.641345] kernel BUG at
 /mnt/host/source/src/third_party/kernel-next/fs/bio.c:1725!
 [3.649266] invalid opcode:  [#1] SMP
 [3.653473] Modules linked in:
 [3.656610] CPU: 0 PID: 107 Comm: loop0 Tainted: GW
 3.12.0-next-20131107 #6
 [3.664645] Hardware name: SAMSUNG Stumpy, BIOS
 Google_Stumpy.2183.0.2012_05_01_1303 05/01/2012
 [3.673463] task: 88010001e250 ti: 880074c7e000 task.ti:
 880074c7e000
 [3.681023] RIP: 0010:[8111229d]  [8111229d]
 bio_endio+0x13/0x59
 [3.67] RSP: 0018:880074c7fc50  EFLAGS: 00010246
 [3.694272] RAX:  RBX: 880074cb6120 RCX: 
 
 [3.701496] RDX: fffb RSI: fffb RDI: 
 880074c4b000
 [3.708728] 

Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-07 Thread Kent Overstreet
On Thu, Nov 07, 2013 at 01:20:26PM -0600, Dave Kleikamp wrote:
 On 11/02/2013 03:50 PM, Dave Kleikamp wrote:
  On 11/01/2013 03:53 PM, Jens Axboe wrote:
 
  So we've three immediate options:
 
  1) You base it on top of the block tree
  2) I carry the loop updates
  3) You hand Stephen a merge patch for the resulting merge of the two
  
  Attached is a merge patch and the merged loop.c. I'm having problems
  with the loop driver with both the block and my tree. I'll continue to
  look at that, but everything should build cleanly with this.
 
 Looking back, I obviously rushed the last patch out. This merge patch,
 and the resulting loop.c, fix my problem. My code is working with Jens'
 block tree now.
 
 Jens,
 I ended up replacing a call to bio_iovec_idx() with __bvec_iter_bvec()
 since the former was removed. It's not very elegant, but it works. I'm
 open to suggestions on a cleaner fix, but it can wait until one or both
 of these trees is merged.

No, that's definitely wrong. Read Documentation/block/biovecs.txt - you
need to use either the new bio_iovec() or bio_iter_iovec(). I can do the
conversion later today.


Re: [PATCH 1/9] block: Convert various code to bio_for_each_segment()

2013-11-07 Thread Kent Overstreet
On Thu, Nov 07, 2013 at 12:26:30PM +0100, Jan Kara wrote:
 On Mon 04-11-13 15:36:19, Kent Overstreet wrote:
  With immutable biovecs we don't want code accessing bi_io_vec directly -
  the uses this patch changes weren't incorrect since they all own the
  bio, but it makes the code harder to audit for no good reason - also,
  this will help with multipage bvecs later.
   I think you've missed the code in fs/ext4/page-io.c in the conversion
 (likely because it was added relatively recently).

Thanks!


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-07 Thread Kent Overstreet
On Fri, Nov 08, 2013 at 12:53:07PM +1100, Stephen Rothwell wrote:
 Hi all,
 
 On Thu, 07 Nov 2013 18:04:57 -0600 Dave Kleikamp dave.kleik...@oracle.com 
 wrote:
 
  Can you please drop the aio-direct tree for the time being?
 
 OK, I was afraid of this, but, yes, I can drop it.  I am not quite sure
 what effect this will have on Andrew's tree, though (hopefully not too
 much).
 
 This is a bit disappointing.  Dave's stuff has been sitting in
 linux-next for quite some time (probably too long) so it is not as if
 Kent and Jens (should) have been unaware of it.

Yeah, I have to apologize for not noticing sooner. Mea culpa. That said, we
spent some time hashing things out on IRC and I'm pretty excited for the new
plan; Zach Brown also had some good input.

  My stuff is not aligning very well with the immutable biovecs and Kent
  has a different approach in mind. I'll be working with him on a
  replacement that will hopefully be simpler.
 
 Presumably not before v3.14, right?

Yeah, the gist of it is that the loopback driver is going to be passing bios
directly to the dio code - my dio rewrite gets us most of the way there, and
after immutable biovecs there are only ~3 (much simpler) patches left that the
dio rewrite depends on.

The remaining issue to deal with is the fact that on writes, the dio code bails
out if it hits an unmapped block (i.e. a hole) and filemap.c finishes it up with
the buffered io code - we need the dio code to handle that in a self contained
fashion. Zach had a (horribly ugly, but definitely workable) idea for that, so
I'm feeling pretty optimistic about this right now.


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-07 Thread Kent Overstreet
On Thu, Nov 07, 2013 at 11:33:24PM -0800, Christoph Hellwig wrote:
 The changes for direct I/O from kernel space have been in for a long
 time, and they are blocking multiple consumers of the interface from
 getting submitted for about a year now.  Even if the guts of the
 direct-io code will get a rewrite based on another patchset from Kent
 that will go on top of the immutable iovec changes we need the
 interfaces now and not another year down the road.

What else is blocked on this patch series? Honest question.


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-07 Thread Kent Overstreet
On Thu, Nov 07, 2013 at 11:44:45PM -0800, Christoph Hellwig wrote:
 On Thu, Nov 07, 2013 at 11:39:59PM -0800, Kent Overstreet wrote:
  On Thu, Nov 07, 2013 at 11:33:24PM -0800, Christoph Hellwig wrote:
   The changes for direct I/O from kernel space have been in for a long
   time, and they are blocking multiple consumers of the interface from
   getting submitted for about a year now.  Even if the guts of the
   direct-io code will get a rewrite based on another patchset from Kent
   that will go on top of the immutable iovec changes we need the
   interfaces now and not another year down the road.
  
  What else is blocked on this patch series? Honest question.
 
 
 From me alone:
   Support for ecryptfs to not double cache.
   AIO support for target_core_file.
 
 To replace code in staging:
   Removal of the lustre-specific loop driver.
 
 And I remember others claiming to want to submit bits, too.

So, I don't think the iov_iter stuff is the right approach for solving
the loop issue; it's an ugly hack and after immutable biovecs we're
pretty close to a better solution and some major cleanups too.

I don't know about ecryptfs and AIO for target, though - are there
patches for those you could point me at so I can have a look? I can
believe the iov_iter stuff is necessary for those, but I'd like to
convince myself there isn't a cleaner solution.

Regardless, I don't want to be blocking anyone else's work; if we do
want the iov_iter stuff in now (I'm not arguing one way or the other
till I look at the issues you pointed out) I can write a small patch to
correctly merge with immutable bvecs; I looked at it today and it's
pretty straightforward (if somewhat ugly).


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-08 Thread Kent Overstreet
On Fri, Nov 08, 2013 at 12:02:21AM -0800, Christoph Hellwig wrote:
 On Thu, Nov 07, 2013 at 11:56:17PM -0800, Kent Overstreet wrote:
  So, I don't think the iov_iter stuff is the right approach for solving
  the loop issue; it's an ugly hack and after immutable biovecs we're
  pretty close to a better solution and some major cleanups too.
 
 All the consumers aren't limited to a block-based filesystem backing,
 including loop.  So we need a file-ops based approach for in-kernel
 dio/aio.  If you have a counter proposal please at least describe it.

The core issue isn't whether the IO is going to a block based filesystem
(but thanks for pointing out that that's not necessarily true!) but
whether we want to work with pinned pages or not. If pinned pages are ok
for everything, then bios as a common interface work - likely evolving
them a bit to be more general (it's just bi_bdev and bi_sector that's
actually block specific) - and IMO that would be far preferable to this
abstraction layer.

If OTOH we need a common interface that's also for places where we can't
afford the overhead of pinning user pages - that's a different story,
and maybe we do need all this infrastructure then. That's why I'm asking
about the stuff you mentioned; I'm honestly not sure.

What I'm working towards though is a clean separation between buffered
and direct code paths, so that buffered IO can continue to work with iovs
and for O_DIRECT the first thing you do is fill out a bio with pinned
pages and send it down to filesystem code or wherever it's going to go.

That make sense? I can show you more concretely what I'm working on if
you want. Or if I'm full of crap and this is useless for what you guys
want I'm sure you'll let me know :)


Re: linux-next: manual merge of the block tree with the aio-direct tree

2013-11-08 Thread Kent Overstreet
On Fri, Nov 08, 2013 at 12:32:51AM -0800, Christoph Hellwig wrote:
 On Fri, Nov 08, 2013 at 12:17:37AM -0800, Kent Overstreet wrote:
  The core issue isn't whether the IO is going to a block based filesystem
  (but thanks for pointing out that that's not necessarily true!) but
  whether we want to work with pinned pages or not. If pinned pages are ok
  for everything, then bios as a common interface work - likely evolving
  them a bit to be more general (it's just bi_bdev and bi_sector that's
  actually block specific) - and IMO that would be far preferable to this
  abstraction layer.
  
  If OTOH we need a common interface that's also for places where we can't
  afford the overhead of pinning user pages - that's a different story,
  and maybe we do need all this infrastructure then. That's why I'm asking
  about the stuff you mentioned; I'm honestly not sure.
 
 For both of them we will deal with kernel-allocated pages that are never
 mapped to userspace.  This is likely to be true for all the consumers
 of in-kernel aio/dio as the existing interfaces handle user pages just
 fine.

Ok, that's good to know.

  What I'm working towards though is a clean separation between buffered
  and direct code paths, so that buffered IO can continue to work with iovs
  and for O_DIRECT the first thing you do is fill out a bio with pinned
  pages and send it down to filesystem code or wherever it's going to go.
 
 I don't think pushing bios above the fs interface is a good idea. Note
 that the iovecs come from userspace for the user pages cases, so there
 is little we can do about that, and non-bio based direct I/O
 implementations generally work directly at just that level and never
 even touch the direct-io.c code.

Bios can point to userspace pages just fine (and they do today for DIO
to block devices/block based filesystems). Don't think of bios as
block device IOs, just think of them as the equivalent of an iovec +
iov_iter except instead of (potentially userspace) pointers you have
page pointers. That's the core part of what they do (and even if we
don't standardize on bios for that we should standardize on _something_
for that functionality).

Here's the helper function I wrote for my dio rewrite - it should really
take an iov_iter instead of uaddr and len, but user iovec -> bio is the
easy bit:

http://evilpiepirate.org/git/linux-bcache.git/commit/?h=block_stuffid=4462c03167767c656986afaf981f891705fd5d3b
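
That helper isn't reproduced here; below is only an illustrative sketch of
the easy bit - pin the user pages and point the bio's biovec at them. All
names and details are assumptions, not the code from that branch, and the
bio is assumed to have been allocated with room for enough bvecs (e.g.
bio_alloc(GFP_KERNEL, nr_pages)):

static int bio_map_user_range(struct bio *bio, unsigned long uaddr,
			      size_t len, int write)
{
	unsigned offset = uaddr & ~PAGE_MASK;
	int i, ret, nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	struct page **pages;

	pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin the user pages; from here on the bio doesn't care whether
	 * the memory was originally userspace or kernel:
	 */
	ret = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (ret < nr_pages) {
		/* unpin any pages we did get - elided */
		kfree(pages);
		return -EFAULT;
	}

	for (i = 0; i < nr_pages; i++) {
		unsigned bytes = min_t(size_t, len, PAGE_SIZE - offset);

		bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
			.bv_page	= pages[i],
			.bv_len		= bytes,
			.bv_offset	= offset,
		};
		bio->bi_iter.bi_size += bytes;

		len -= bytes;
		offset = 0;
	}

	kfree(pages);
	return 0;
}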

 If you want to redo the ->direct_IO address_space operation and
 generic_file_direct_write and the direct I/O side of
 generic_file_aio_read (both of which aren't anywhere near as generic as
 the name claims) I'm all for it, but it really won't affect the consumer
 of the in-kernel aio/dio code.

I'm skeptical, but I'm way too tired to make good arguments and this
touches on too much code that I'm less familiar with.

Also, the flow of control in this code is such a goddamn clusterfuck I
don't even know what to say.

I'll dig more into the ecryptfs and target aio stuff tomorrow though.

  That make sense? I can show you more concretely what I'm working on if
  you want. Or if I'm full of crap and this is useless for what you guys
  want I'm sure you'll let me know :)
 
 It sounds interesting, but also a little confusing at this point, at
 least from the non-block side of view.

Zach, you want to chime in? He was involved in the discussion yesterday;
he might be able to explain this stuff better than I can.


Re: [PATCH 16/22] dm: Refactor for new bio cloning/splitting

2013-10-02 Thread Kent Overstreet
On Sat, Sep 28, 2013 at 12:59:09AM -0400, Mike Snitzer wrote:
 Hey Kent,
 
 I haven't been able to pinpoint the issue yet, but using your for-jens
 branch, if I create a dm-thin volume with this lvm command:
 lvcreate -L20G -V20G -T vg/pool --name thinlv
 
 and try to format /dev/vg/thinlv with XFS the kernel warns and then
 hangs with the following:

Thanks, for whatever reason ext4 wasn't tickling that codepath but xfs
does. I folded the fix into my for-jens branch, here's what changed:

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b60b350..79cee1a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1098,7 +1098,7 @@ static void clone_bio(struct dm_target_io *tio, struct 
bio *bio,
bio_integrity_clone(clone, bio, GFP_NOIO);
 
bio_advance(clone, to_bytes(sector - clone-bi_iter.bi_sector));
-   bio-bi_iter.bi_size = to_bytes(len);
+   clone-bi_iter.bi_size = to_bytes(len);
 
if (bio_integrity(bio))
bio_integrity_trim(clone, 0, len);
@@ -1267,7 +1267,7 @@ static int __split_and_process_non_flush(struct 
clone_info *ci)
if (!dm_target_is_valid(ti))
return -EIO;
 
-   len = min_t(unsigned, max_io_len(ci-sector, ti), bio_sectors(bio));
+   len = min_t(sector_t, max_io_len(ci-sector, ti), ci-sector_count);
 
__clone_and_map_data_bio(ci, ti, ci-sector, len);
 


Re: [PATCH 16/22] dm: Refactor for new bio cloning/splitting

2013-10-03 Thread Kent Overstreet
On Wed, Oct 02, 2013 at 11:23:21PM -0400, Mike Snitzer wrote:
 Great, thanks for finding this, I'll test and review the code further.

Cool - let me know if you find anything else (how thoroughly do you
think you've tested it so far?)


[PATCH] block: submit_bio_wait() conversions

2013-11-23 Thread Kent Overstreet
It was being open coded in a few places.
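
For reference, a sketch of the conversion - the open-coded pattern on top
is what the hunks below remove (function names here are illustrative; in
md.c the end_io helper was called bi_complete()):

/* Before: an on-stack completion plus a trivial bi_end_io */
static int sync_submit_open_coded(int rw, struct bio *bio)
{
	DECLARE_COMPLETION_ONSTACK(event);

	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;		/* just calls complete() */
	submit_bio(rw, bio);
	wait_for_completion(&event);

	return test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
}

/* After: submit_bio_wait() does all of the above internally */
static int sync_submit(int rw, struct bio *bio)
{
	return submit_bio_wait(rw, bio);
}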

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Joern Engel jo...@logfs.org
Cc: Prasad Joshi prasadjoshi.li...@gmail.com
Cc: Neil Brown ne...@suse.de
Cc: Chris Mason chris.ma...@fusionio.com
---
 block/blk-flush.c  | 19 +--
 drivers/md/md.c| 12 +---
 fs/btrfs/check-integrity.c | 32 +---
 fs/btrfs/check-integrity.h |  2 ++
 fs/btrfs/extent_io.c   | 12 +---
 fs/btrfs/scrub.c   | 33 -
 fs/hfsplus/wrapper.c   | 17 +
 fs/logfs/dev_bdev.c| 13 +
 8 files changed, 24 insertions(+), 116 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 331e627..fb6f3c0 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q)
}
 }
 
-static void bio_end_flush(struct bio *bio, int err)
-{
-   if (err)
-   clear_bit(BIO_UPTODATE, bio-bi_flags);
-   if (bio-bi_private)
-   complete(bio-bi_private);
-   bio_put(bio);
-}
-
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:  blockdev to issue flush for
@@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err)
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
 {
-   DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
int ret = 0;
@@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t 
gfp_mask,
return -ENXIO;
 
bio = bio_alloc(gfp_mask, 0);
-   bio-bi_end_io = bio_end_flush;
bio-bi_bdev = bdev;
-   bio-bi_private = wait;
 
-   bio_get(bio);
-   submit_bio(WRITE_FLUSH, bio);
-   wait_for_completion_io(wait);
+   ret = submit_bio_wait(WRITE_FLUSH, bio);
 
/*
 * The driver must store the error location in -bi_sector, if
@@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t 
gfp_mask,
if (error_sector)
*error_sector = bio-bi_sector;
 
-   if (!bio_flagged(bio, BIO_UPTODATE))
-   ret = -EIO;
-
bio_put(bio);
return ret;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b6b7a28..8700de3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -776,16 +776,10 @@ void md_super_wait(struct mddev *mddev)
finish_wait(mddev-sb_wait, wq);
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-   complete((struct completion*)bio-bi_private);
-}
-
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 struct page *page, int rw, bool metadata_op)
 {
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev-mddev);
-   struct completion event;
int ret;
 
rw |= REQ_SYNC;
@@ -801,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, 
int size,
else
bio-bi_sector = sector + rdev-data_offset;
bio_add_page(bio, page, size, 0);
-   init_completion(event);
-   bio-bi_private = event;
-   bio-bi_end_io = bi_complete;
-   submit_bio(rw, bio);
-   wait_for_completion(event);
+   submit_bio_wait(rw, bio);
 
ret = test_bit(BIO_UPTODATE, bio-bi_flags);
bio_put(bio);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b50764b..131d828 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct 
btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
  struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
for (i = 0; i  num_pages;) {
struct bio *bio;
unsigned int j;
-   DECLARE_COMPLETION_ONSTACK(complete);
 
bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
if (!bio) {
@@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
}
bio-bi_bdev = block_ctx-dev-bdev;
bio-bi_sector = dev_bytenr  9;
-   bio-bi_end_io = btrfsic_complete_bio_end_io;
-   bio-bi_private = complete;
 
for (j = i; j  num_pages; j++) {
ret = bio_add_page(bio, block_ctx-pagev[j],
@@ -1712,12 +1708,7 @@ static int btrfsic_read_block

[GIT PULL] Immutable biovecs

2013-11-25 Thread Kent Overstreet
Jens - here's immutable biovecs, rebased and ready for 3.14. Changes since the
last version of the series:

 * bio_clone_bioset() retains the old behaviour, as previously discussed -
   bio_clone_fast() is being used by bcache, dm and the new bio_split().

   There aren't that many users of the old bio_clone() and we should eventually
   convert all the users that don't actually need it (for a couple of reasons),
   but that can be left to future cleanups.

 * refactoring patches to use submit_bio_wait() and bio_for_each_segment()

Regarding the bio chaining stuff, I'm not terribly happy with all the
bio_endio_nodec() calls but I want to keep that stuff simple for now - Mike
Snitzer had a good suggestion for that though. But I am seeing more and more
ways we can use that functionality to clean other stuff up - also, we should be
able to drop bi_cnt eventually, bi_remaining seems like it can be used to
replace the various bio_get() uses.
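
For reference, the chaining primitive itself is tiny - roughly this, as a
sketch of what the series implements (details may differ from the tree):

static void bio_chain_endio(struct bio *bio, int error)
{
	bio_endio(bio->bi_private, error);	/* propagate to the parent */
	bio_put(bio);
}

void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private	= parent;
	bio->bi_end_io	= bio_chain_endio;
	atomic_inc(&parent->bi_remaining);	/* parent's bi_end_io runs only
						 * when bi_remaining hits 0 */
}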

The following changes since commit 6ce4eac1f600b34f2f7f58f9cd8f0503d79e42ae:

  Linux 3.13-rc1 (2013-11-22 11:30:55 -0800)

are available in the git repository at:

  git://evilpiepirate.org/~kent/linux-bcache.git for-jens

for you to fetch changes up to 4b1faf931650d4a35b2a570318862821d6a962e3:

  block: Kill bio_pair_split() (2013-11-23 22:33:57 -0800)


Kent Overstreet (25):
  block: submit_bio_wait() conversions
  block: Convert various code to bio_for_each_segment()
  bcache: Kill unaligned bvec hack
  block: Abstract out bvec iterator
  dm: Use bvec_iter for dm_bio_record()
  block: Convert bio_iovec() to bvec_iter
  block: Convert bio_for_each_segment() to bvec_iter
  block: Immutable bio vecs
  block: Convert bio_copy_data() to bvec_iter
  bio-integrity: Convert to bvec_iter
  block: Kill bio_segments()/bi_vcnt usage
  block: Convert drivers to immutable biovecs
  aoe: Convert to immutable biovecs
  ceph: Convert to immutable biovecs
  block: Kill bio_iovec_idx(), __bio_iovec()
  block: Refactor bio_clone_bioset() for immutable biovecs
  block: Add bio_clone_fast()
  rbd: Refactor bio cloning
  dm: Refactor for new bio cloning/splitting
  block: Don't save/copy bvec array anymore
  block: Remove bi_idx hacks
  block: Generic bio chaining
  block: Rename bio_split() -> bio_pair_split()
  block: Introduce new bio_split()
  block: Kill bio_pair_split()

 Documentation/block/biodoc.txt  |   7 +-
 Documentation/block/biovecs.txt | 111 ++
 arch/m68k/emu/nfblock.c |  13 +-
 arch/powerpc/sysdev/axonram.c   |  21 +-
 block/blk-core.c|  40 +--
 block/blk-flush.c   |  21 +-
 block/blk-integrity.c   |  40 ++-
 block/blk-lib.c |  12 +-
 block/blk-map.c |   6 +-
 block/blk-merge.c   |  66 ++--
 block/blk-mq.c  |   2 +-
 block/blk-throttle.c|  14 +-
 block/elevator.c|   2 +-
 drivers/block/aoe/aoe.h |  10 +-
 drivers/block/aoe/aoecmd.c  | 153 -
 drivers/block/brd.c |  16 +-
 drivers/block/drbd/drbd_actlog.c|   2 +-
 drivers/block/drbd/drbd_bitmap.c|   2 +-
 drivers/block/drbd/drbd_main.c  |  27 +-
 drivers/block/drbd/drbd_receiver.c  |  19 +-
 drivers/block/drbd/drbd_req.c   |   6 +-
 drivers/block/drbd/drbd_req.h   |   2 +-
 drivers/block/drbd/drbd_worker.c|   8 +-
 drivers/block/floppy.c  |  16 +-
 drivers/block/loop.c|  27 +-
 drivers/block/mtip32xx/mtip32xx.c   |  20 +-
 drivers/block/nbd.c |  14 +-
 drivers/block/nvme-core.c   | 142 ++--
 drivers/block/pktcdvd.c | 182 +-
 drivers/block/ps3disk.c |  17 +-
 drivers/block/ps3vram.c |  12 +-
 drivers/block/rbd.c |  91 +
 drivers/block/rsxx/dev.c|   6 +-
 drivers/block/rsxx/dma.c|  15 +-
 drivers/block/umem.c|  53 ++-
 drivers/block/xen-blkback/blkback.c |   2 +-
 drivers/block/xen-blkfront.c|   2 +-
 drivers/md/bcache/bcache.h  |   2 -
 drivers/md/bcache/btree.c   |   8 +-
 drivers/md/bcache/debug.c   |  21 +-
 drivers/md/bcache/io.c  | 196 ++-
 drivers/md/bcache/journal.c |  12 +-
 drivers/md/bcache/movinggc.c|   4 +-
 drivers/md/bcache/request.c | 131 +++-
 drivers/md/bcache/super.c   |  20 +-
 drivers/md/bcache/util.c

[PATCH] Make generic_make_request() handle arbitrary size bios

2013-11-25 Thread Kent Overstreet
This builds off of immutable biovecs; it's a small enough patch series that I'd
like to try to get it into 3.14, but it is a pretty significant change in
behaviour for the block layer, so it should definitely be considered separately
from that series.

What the series does is pretty simple - like the title says, you can now pass
generic_make_request arbitrary size bios - you no longer have to care about
device limits, and we can start deleting a ton of code.

This is done by splitting bios as needed. The idea is to push the splitting
lower and lower over time, and in many cases we eventually won't have to
actually split the bios at all - it'll happen implicitly. For now, though, we do
it the simple, easy way, with a new function (blk_queue_split()) that mimics the
old behaviour of bio_add_page().
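
Usage is a one-liner at the top of a make_request_fn - a sketch, assuming
the signature this series gives blk_queue_split():

static void my_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * Chop off and resubmit anything beyond what this queue's limits
	 * allow; the pieces are chained, so the original bio completes
	 * only when every piece has:
	 */
	blk_queue_split(q, &bio, q->bio_split);

	/* ... normal driver processing of the now suitably-sized bio ... */
}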

This series only starts with the obvious easy things - bio_add_page(), and the
stuff in blk-lib.c. There's a lot more coming - with the new bio_split() that
can split arbitrary size bios, the various virtual block drivers (md, dm, drbd,
etc.) should all either be capable of handling arbitrary size bios now or need
only minor changes, so we can delete merge_bvec_fn.

This patch series doesn't deal with REQ_TYPE_BLOCK_PC bios - bio_add_pc_page() and
the blk_execute_rq() codepath are unchanged. I'd like to get rid of direct
request submission eventually, but that is a different patch series.



[PATCH 1/5] block: Make generic_make_request handle arbitrary sized bios

2013-11-25 Thread Kent Overstreet
The way the block layer is currently written, it goes to great lengths
to avoid having to split bios; upper layer code (such as bio_add_page())
checks what the underlying device can handle and tries to always create
bios that don't need to be split.

But this approach becomes unwieldy and eventually breaks down with
stacked devices and devices with dynamic limits, and it adds a lot of
complexity. If the block layer could split bios as needed, we could
eliminate a lot of complexity elsewhere - particularly in stacked
drivers. Code that creates bios can then create whatever size bios are
convenient, and more importantly stacked drivers don't have to deal with
both their own bio size limitations and the limitations of the
(potentially multiple) devices underneath them.  In the future this will
let us delete merge_bvec_fn and a bunch of other code.

We do this by adding calls to blk_queue_split() to the various
make_request functions that need it - a few can already handle arbitrary
size bios. Note that we add the call _after_ any call to blk_queue_bounce();
this means that blk_queue_split() and blk_recalc_rq_segments() don't need to be
concerned with bouncing affecting segment merging.

Some make_request_fns were simple enough to audit and verify they don't
need blk_queue_split() calls. The skipped ones are:

 * nfhd_make_request (arch/m68k/emu/nfblock.c)
 * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
 * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
 * brd_make_request (ramdisk - drivers/block/brd.c)
 * loop_make_request
 * null_queue_bio
 * bcache's make_request fns

Some others are almost certainly safe to remove now, but will be left for future
patches.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Neil Brown ne...@suse.de
Cc: Alasdair Kergon a...@redhat.com
Cc: dm-de...@redhat.com
---
 block/blk-core.c|  19 ++--
 block/blk-merge.c   | 150 ++--
 block/blk-mq.c  |   2 +
 drivers/block/drbd/drbd_req.c   |   2 +
 drivers/block/mtip32xx/mtip32xx.c   |   6 +-
 drivers/block/nvme-core.c   |   2 +
 drivers/block/pktcdvd.c |   6 +-
 drivers/block/ps3vram.c |   2 +
 drivers/block/rsxx/dev.c|   2 +
 drivers/block/umem.c|   2 +
 drivers/md/dm.c |   2 +
 drivers/md/md.c |   2 +
 drivers/s390/block/dcssblk.c|   2 +
 drivers/s390/block/xpram.c  |   2 +
 drivers/staging/lustre/lustre/llite/lloop.c |   2 +
 drivers/staging/zram/zram_drv.c |   2 +
 include/linux/blkdev.h  |   3 +
 17 files changed, 185 insertions(+), 23 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5da8e90..5cc7171 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -566,6 +566,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (q-id  0)
goto fail_c;
 
+   q-bio_split = bioset_create(4, 0);
+   if (!q-bio_split)
+   goto fail_id;
+
q-backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q-backing_dev_info.state = 0;
@@ -575,7 +579,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
err = bdi_init(q-backing_dev_info);
if (err)
-   goto fail_id;
+   goto fail_split;
 
setup_timer(q-backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
@@ -620,6 +624,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 
 fail_bdi:
bdi_destroy(q-backing_dev_info);
+fail_split:
+   bioset_free(q-bio_split);
 fail_id:
ida_simple_remove(blk_queue_ida, q-id);
 fail_c:
@@ -1472,6 +1478,8 @@ void blk_queue_bio(struct request_queue *q, struct bio 
*bio)
struct request *req;
unsigned int request_count = 0;
 
+   blk_queue_split(q, bio, q-bio_split);
+
/*
 * low level driver can indicate that it wants pages above a
 * certain limit bounced to low memory (ie for highmem, or even
@@ -1694,15 +1702,6 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
 
-   if (likely(bio_is_rw(bio) 
-  nr_sectors  queue_max_hw_sectors(q))) {
-   printk(KERN_ERR bio too big device %s (%u  %u)\n,
-  bdevname(bio-bi_bdev, b),
-  bio_sectors(bio),
-  queue_max_hw_sectors(q));
-   goto end_io;
-   }
-
part = bio-bi_bdev-bd_part;
if (should_fail_request(part, bio-bi_iter.bi_size) ||
should_fail_request(part_to_disk(part)-part0,
diff --git a/block/blk-merge.c b/block/blk

[PATCH 4/5] blk-lib.c: generic_make_request() handles large bios now

2013-11-25 Thread Kent Overstreet
generic_make_request() will now do for us what the code in blk-lib.c was
doing manually, with the bio_batch stuff - we still need some looping in
case we're trying to discard/zeroout more than around a gigabyte, but
when we can submit that much at a time, doing the submissions in parallel
really shouldn't matter.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 block/blk-lib.c | 175 ++--
 1 file changed, 30 insertions(+), 145 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2da76c9..368c36a 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -9,23 +9,6 @@
 
 #include blk.h
 
-struct bio_batch {
-   atomic_tdone;
-   unsigned long   flags;
-   struct completion   *wait;
-};
-
-static void bio_batch_end_io(struct bio *bio, int err)
-{
-   struct bio_batch *bb = bio-bi_private;
-
-   if (err  (err != -EOPNOTSUPP))
-   clear_bit(BIO_UPTODATE, bb-flags);
-   if (atomic_dec_and_test(bb-done))
-   complete(bb-wait);
-   bio_put(bio);
-}
-
 /**
  * blkdev_issue_discard - queue a discard
  * @bdev:  blockdev to issue discard for
@@ -40,15 +23,10 @@ static void bio_batch_end_io(struct bio *bio, int err)
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
 {
-   DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD;
-   unsigned int max_discard_sectors, granularity;
-   int alignment;
-   struct bio_batch bb;
struct bio *bio;
int ret = 0;
-   struct blk_plug plug;
 
if (!q)
return -ENXIO;
@@ -56,78 +34,28 @@ int blkdev_issue_discard(struct block_device *bdev, 
sector_t sector,
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
 
-   /* Zero-sector (unknown) and one-sector granularities are the same.  */
-   granularity = max(q-limits.discard_granularity  9, 1U);
-   alignment = (bdev_discard_alignment(bdev)  9) % granularity;
-
-   /*
-* Ensure that max_discard_sectors is of the proper
-* granularity, so that requests stay aligned after a split.
-*/
-   max_discard_sectors = min(q-limits.max_discard_sectors, UINT_MAX  9);
-   max_discard_sectors -= max_discard_sectors % granularity;
-   if (unlikely(!max_discard_sectors)) {
-   /* Avoid infinite loop below. Being cautious never hurts. */
-   return -EOPNOTSUPP;
-   }
-
if (flags  BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
 
-   atomic_set(bb.done, 1);
-   bb.flags = 1  BIO_UPTODATE;
-   bb.wait = wait;
-
-   blk_start_plug(plug);
while (nr_sects) {
-   unsigned int req_sects;
-   sector_t end_sect, tmp;
-
bio = bio_alloc(gfp_mask, 1);
-   if (!bio) {
-   ret = -ENOMEM;
-   break;
-   }
-
-   req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
-
-   /*
-* If splitting a request, and the next starting sector would be
-* misaligned, stop the discard at the previous aligned sector.
-*/
-   end_sect = sector + req_sects;
-   tmp = end_sect;
-   if (req_sects  nr_sects 
-   sector_div(tmp, granularity) != alignment) {
-   end_sect = end_sect - alignment;
-   sector_div(end_sect, granularity);
-   end_sect = end_sect * granularity + alignment;
-   req_sects = end_sect - sector;
-   }
+   if (!bio)
+   return -ENOMEM;
 
-   bio-bi_iter.bi_sector = sector;
-   bio-bi_end_io = bio_batch_end_io;
bio-bi_bdev = bdev;
-   bio-bi_private = bb;
+   bio-bi_iter.bi_sector = sector;
+   bio-bi_iter.bi_size = min_t(sector_t, nr_sects, 1  20)  9;
 
-   bio-bi_iter.bi_size = req_sects  9;
-   nr_sects -= req_sects;
-   sector = end_sect;
+   sector += bio_sectors(bio);
+   nr_sects -= bio_sectors(bio);
 
-   atomic_inc(bb.done);
-   submit_bio(type, bio);
+   ret = submit_bio_wait(type, bio);
+   if (ret)
+   break;
}
-   blk_finish_plug(plug);
-
-   /* Wait for bios in-flight */
-   if (!atomic_dec_and_test(bb.done))
-   wait_for_completion_io(wait);
-
-   if (!test_bit(BIO_UPTODATE, bb.flags))
-   ret = -EIO;
 
return

[PATCH 2/5] mtip32xx: handle arbitrary size bios

2013-11-25 Thread Kent Overstreet
We get a measurable performance increase by handling this in the driver when
we're already looping over the biovec, instead of handling it separately in
generic_make_request() (or bio_add_page() originally).

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Asai Thambi S P asamymuth...@micron.com
Cc: Sam Bradshaw sbrads...@micron.com
---
 drivers/block/mtip32xx/mtip32xx.c | 47 ---
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 1b3dfee..2b142eb 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2648,24 +2648,6 @@ static void mtip_hw_submit_io(struct driver_data *dd, 
sector_t sector,
 }
 
 /*
- * Release a command slot.
- *
- * @dd  Pointer to the driver data structure.
- * @tag Slot tag
- *
- * return value
- *  None
- */
-static void mtip_hw_release_scatterlist(struct driver_data *dd, int tag,
-   int unaligned)
-{
-   struct semaphore *sem = unaligned ? dd-port-cmd_slot_unal :
-   dd-port-cmd_slot;
-   release_slot(dd-port, tag);
-   up(sem);
-}
-
-/*
  * Obtain a command slot and return its associated scatter list.
  *
  * @dd  Pointer to the driver data structure.
@@ -3969,8 +3951,6 @@ static void mtip_make_request(struct request_queue 
*queue, struct bio *bio)
 
blk_queue_bounce(queue, bio);
 
-   blk_queue_split(queue, bio, q-bio_split);
-
if (unlikely(dd-dd_flag  MTIP_DDF_STOP_IO)) {
if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
dd-dd_flag))) {
@@ -4020,21 +4000,22 @@ static void mtip_make_request(struct request_queue 
*queue, struct bio *bio)
 
sg = mtip_hw_get_scatterlist(dd, tag, unaligned);
if (likely(sg != NULL)) {
-   if (unlikely((bio)-bi_vcnt  MTIP_MAX_SG)) {
-   dev_warn(dd-pdev-dev,
-   Maximum number of SGL entries exceeded\n);
-   bio_io_error(bio);
-   mtip_hw_release_scatterlist(dd, tag, unaligned);
-   return;
-   }
-
/* Create the scatter list for this bio. */
bio_for_each_segment(bvec, bio, iter) {
-   sg_set_page(sg[nents],
-   bvec.bv_page,
-   bvec.bv_len,
-   bvec.bv_offset);
-   nents++;
+   if (unlikely(nents == MTIP_MAX_SG)) {
+   struct bio *split = bio_clone(bio, GFP_NOIO);
+
+   split-bi_iter = iter;
+   bio-bi_iter.bi_size -= iter.bi_size;
+   bio_chain(split, bio);
+   generic_make_request(split);
+   break;
+   }
+
+   sg_set_page(sg[nents++],
+   bvec.bv_page,
+   bvec.bv_len,
+   bvec.bv_offset);
}
 
/* Issue the read/write. */
-- 
1.8.4.3



[PATCH 3/5] block: Gut bio_add_page()

2013-11-25 Thread Kent Overstreet
Since generic_make_request() can now handle arbitrary size bios, all we
have to do is make sure the bvec array doesn't overflow.
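
A sketch of the resulting shape of bio_add_page() - the real hunk below
also merges contiguous pages into the previous bvec and refuses cloned
bios, but the essential check is just this:

int bio_add_page(struct bio *bio, struct page *page,
		 unsigned int len, unsigned int offset)
{
	struct bio_vec *bv;

	/*
	 * With generic_make_request() splitting as needed, the bvec
	 * array itself is the only remaining hard limit:
	 */
	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	bv		= &bio->bi_io_vec[bio->bi_vcnt];
	bv->bv_page	= page;
	bv->bv_len	= len;
	bv->bv_offset	= offset;

	bio->bi_vcnt++;
	bio->bi_iter.bi_size += len;
	return len;
}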

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 fs/bio.c | 137 ++-
 1 file changed, 57 insertions(+), 80 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 75c49a3..da27557 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -690,9 +690,23 @@ int bio_get_nr_vecs(struct block_device *bdev)
 }
 EXPORT_SYMBOL(bio_get_nr_vecs);
 
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors)
+/**
+ * bio_add_pc_page -   attempt to add page to bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by REQ_PC bios.
+ */
+int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
+   *page, unsigned int len, unsigned int offset)
 {
int retried_segments = 0;
struct bio_vec *bvec;
@@ -703,7 +717,7 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
 
-   if (((bio-bi_iter.bi_size + len)  9)  max_sectors)
+   if (((bio-bi_iter.bi_size + len)  9)  queue_max_hw_sectors(q))
return 0;
 
/*
@@ -716,28 +730,7 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
 
if (page == prev-bv_page 
offset == prev-bv_offset + prev-bv_len) {
-   unsigned int prev_bv_len = prev-bv_len;
prev-bv_len += len;
-
-   if (q-merge_bvec_fn) {
-   struct bvec_merge_data bvm = {
-   /* prev_bvec is already charged in
-  bi_size, discharge it in order to
-  simulate merging updated prev_bvec
-  as new bvec. */
-   .bi_bdev = bio-bi_bdev,
-   .bi_sector = bio-bi_iter.bi_sector,
-   .bi_size = bio-bi_iter.bi_size -
-   prev_bv_len,
-   .bi_rw = bio-bi_rw,
-   };
-
-   if (q-merge_bvec_fn(q, bvm, prev)  
prev-bv_len) {
-   prev-bv_len -= len;
-   return 0;
-   }
-   }
-
goto done;
}
}
@@ -768,31 +761,6 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
bvec-bv_len = len;
bvec-bv_offset = offset;
 
-   /*
-* if queue has other restrictions (eg varying max sector size
-* depending on offset), it can specify a merge_bvec_fn in the
-* queue to get further control
-*/
-   if (q-merge_bvec_fn) {
-   struct bvec_merge_data bvm = {
-   .bi_bdev = bio-bi_bdev,
-   .bi_sector = bio-bi_iter.bi_sector,
-   .bi_size = bio-bi_iter.bi_size,
-   .bi_rw = bio-bi_rw,
-   };
-
-   /*
-* merge_bvec_fn() returns number of bytes it can accept
-* at this offset
-*/
-   if (q-merge_bvec_fn(q, bvm, bvec)  bvec-bv_len) {
-   bvec-bv_page = NULL;
-   bvec-bv_len = 0;
-   bvec-bv_offset = 0;
-   return 0;
-   }
-   }
-
/* If we may be able to merge these biovecs, force a recount */
if (bio-bi_vcnt  (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio-bi_flags = ~(1  BIO_SEG_VALID);
@@ -803,28 +771,6 @@ static int __bio_add_page(struct request_queue *q, struct 
bio *bio, struct page
bio-bi_iter.bi_size += len;
return len;
 }
-
-/**
- * bio_add_pc_page -   attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist

[PATCH 5/5] bcache: generic_make_request() handles large bios now

2013-11-25 Thread Kent Overstreet
So we get to delete our hacky workaround.

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/md/bcache/bcache.h|  18 
 drivers/md/bcache/io.c| 100 +-
 drivers/md/bcache/journal.c   |   4 +-
 drivers/md/bcache/request.c   |  16 +++
 drivers/md/bcache/super.c |  32 +-
 drivers/md/bcache/util.h  |   5 ++-
 drivers/md/bcache/writeback.c |   4 +-
 7 files changed, 18 insertions(+), 161 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 964353c..8f65331 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -241,19 +241,6 @@ struct keybuf {
DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
 };
 
-struct bio_split_pool {
-   struct bio_set  *bio_split;
-   mempool_t   *bio_split_hook;
-};
-
-struct bio_split_hook {
-   struct closure  cl;
-   struct bio_split_pool   *p;
-   struct bio  *bio;
-   bio_end_io_t*bi_end_io;
-   void*bi_private;
-};
-
 struct bcache_device {
struct closure  cl;
 
@@ -286,8 +273,6 @@ struct bcache_device {
int (*cache_miss)(struct btree *, struct search *,
  struct bio *, unsigned);
int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct io {
@@ -465,8 +450,6 @@ struct cache {
atomic_long_t   meta_sectors_written;
atomic_long_t   btree_sectors_written;
atomic_long_t   sectors_written;
-
-   struct bio_split_pool   bio_split_hook;
 };
 
 struct gc_stat {
@@ -901,7 +884,6 @@ void bch_bbio_endio(struct cache_set *, struct bio *, int, 
const char *);
 void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
-void bch_generic_make_request(struct bio *, struct bio_split_pool *);
 void __bch_submit_bbio(struct bio *, struct cache_set *);
 void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, 
unsigned);
 
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa..86a0bb8 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,104 +11,6 @@
 
 #include linux/blkdev.h
 
-static unsigned bch_bio_max_sectors(struct bio *bio)
-{
-   struct request_queue *q = bdev_get_queue(bio-bi_bdev);
-   struct bio_vec bv;
-   struct bvec_iter iter;
-   unsigned ret = 0, seg = 0;
-
-   if (bio-bi_rw  REQ_DISCARD)
-   return min(bio_sectors(bio), q-limits.max_discard_sectors);
-
-   bio_for_each_segment(bv, bio, iter) {
-   struct bvec_merge_data bvm = {
-   .bi_bdev= bio-bi_bdev,
-   .bi_sector  = bio-bi_iter.bi_sector,
-   .bi_size= ret  9,
-   .bi_rw  = bio-bi_rw,
-   };
-
-   if (seg == min_t(unsigned, BIO_MAX_PAGES,
-queue_max_segments(q)))
-   break;
-
-   if (q-merge_bvec_fn 
-   q-merge_bvec_fn(q, bvm, bv)  (int) bv.bv_len)
-   break;
-
-   seg++;
-   ret += bv.bv_len  9;
-   }
-
-   ret = min(ret, queue_max_sectors(q));
-
-   WARN_ON(!ret);
-   ret = max_t(int, ret, bio_iovec(bio).bv_len  9);
-
-   return ret;
-}
-
-static void bch_bio_submit_split_done(struct closure *cl)
-{
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   s-bio-bi_end_io = s-bi_end_io;
-   s-bio-bi_private = s-bi_private;
-   bio_endio_nodec(s-bio, 0);
-
-   closure_debug_destroy(s-cl);
-   mempool_free(s, s-p-bio_split_hook);
-}
-
-static void bch_bio_submit_split_endio(struct bio *bio, int error)
-{
-   struct closure *cl = bio-bi_private;
-   struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
-
-   if (error)
-   clear_bit(BIO_UPTODATE, s-bio-bi_flags);
-
-   bio_put(bio);
-   closure_put(cl);
-}
-
-void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
-{
-   struct bio_split_hook *s;
-   struct bio *n;
-
-   if (!bio_has_data(bio)  !(bio-bi_rw  REQ_DISCARD))
-   goto submit;
-
-   if (bio_sectors(bio) = bch_bio_max_sectors(bio))
-   goto submit;
-
-   s = mempool_alloc(p-bio_split_hook, GFP_NOIO);
-   closure_init(s-cl, NULL);
-
-   s-bio  = bio;
-   s-p= p;
-   s-bi_end_io= bio-bi_end_io;
-   s-bi_private   = bio-bi_private;
-   bio_get(bio);
-
-   do {
-   n = bio_next_split(bio, bch_bio_max_sectors(bio),
-  GFP_NOIO, s-p-bio_split);
-
-   n-bi_end_io= bch_bio_submit_split_endio

Re: GPF in aio_migratepage

2013-11-25 Thread Kent Overstreet
On Tue, Nov 26, 2013 at 01:01:32AM -0500, Dave Jones wrote:
 On Mon, Nov 25, 2013 at 10:26:45PM -0500, Dave Jones wrote:
   Hi Kent,
   
   I hit the GPF below on a tree based on 
 8e45099e029bb6b369b27d8d4920db8caff5ecce
   which has your commit e34ecee2ae791df674dfb466ce40692ca6218e43
   (aio: Fix a trinity splat).  Is this another path your patch missed, or
   a completely different bug to what you were chasing ?
 
 And here's another from a different path, this time on 32bit.

I'm pretty sure this is a different bug... it appears to be related to
aio ring buffer migration, which I don't think I've touched.

Any information on what it was doing at the time? I see exit_aio() in
the second backtrace, maybe some sort of race between migratepage and
ioctx teardown? But it is using the address space mapping, so I dunno.

I don't see what's protecting ctx->ring_pages - I imagine it's got to
have something to do with the page migration machinery but I have no
idea how that works. Ben?

 Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC
 Modules linked in: tun fuse hidp rfcomm bnep scsi_transport_iscsi l2tp_ppp 
 l2tp_netlink l2tp_core nfc caif_socket caif af_802154 phonet af_rxrpc 
 bluetooth rfkill can_raw can_bcm can llc2 pppoe pppox ppp_generic slhc irda 
 crc_ccitt rds af_key rose x25 atm netrom appletalk ipx p8023 p8022 psnap llc 
 ax25 nouveau video backlight mxm_wmi wmi i2c_algo_bit ttm drm_kms_helper drm 
 i2c_core kvm_intel kvm tg3 ptp pps_core libphy serio_raw pcspkr lpc_ich 
 microcode mfd_core rtc_cmos parport_pc parport shpchp xfs libcrc32c raid0 
 floppy
 CPU: 0 PID: 4517 Comm: trinity-child0 Not tainted 3.13.0-rc1+ #6 
 Hardware name: Dell Inc. Precision WorkStation 490
 /0DT031, BIOS A08 04/25/2008
 task: ed899630 ti: dea22000 task.ti: dea22000
 EIP: 0060:[c11c7a02] EFLAGS: 00010293 CPU: 0
 EIP is at aio_migratepage+0xad/0x126
 EAX: 0144 EBX: f6844ed8 ECX: deaf4a84 EDX: 6b6b6b6b
 ESI: f68dc508 EDI: deaf4800 EBP: dea23bcc ESP: dea23ba8
  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
 CR0: 8005003b CR2: 6b6b707b CR3: 2c985000 CR4: 07f0
 Stack:
   0001 deaf4a84 0286 d709b280  f68dc508 c11c7955
  f6844ed8 dea23c0c c116aa9f 0001 0001 c11c7955 c1179a33 
   c114166d f6844ed8 f6844ed8 c1140fc9 dea23c0c  f6844ed8
 Call Trace:
  [c11c7955] ? free_ioctx+0x62/0x62
  [c116aa9f] move_to_new_page+0x63/0x1bb
  [c11c7955] ? free_ioctx+0x62/0x62
  [c1179a33] ? mem_cgroup_prepare_migration+0xc1/0x243
  [c114166d] ? isolate_migratepages_range+0x3fb/0x675
  [c1140fc9] ? isolate_freepages_block+0x316/0x316
  [c116b319] migrate_pages+0x614/0x72b
  [c1140fc9] ? isolate_freepages_block+0x316/0x316
  [c1141c21] compact_zone+0x294/0x475
  [c1142065] try_to_compact_pages+0x129/0x196
  [c15b95e7] __alloc_pages_direct_compact+0x91/0x197
  [c112a25c] __alloc_pages_nodemask+0x863/0xa55
  [c116b68f] get_huge_zero_page+0x52/0xf9
  [c116ef78] do_huge_pmd_anonymous_page+0x24e/0x39f
  [c1171c4b] ? __mem_cgroup_count_vm_event+0xa6/0x191
  [c1171c64] ? __mem_cgroup_count_vm_event+0xbf/0x191
  [c114815c] handle_mm_fault+0x235/0xd9a
  [c15c7586] ? __do_page_fault+0xf8/0x5a1
  [c15c75ee] __do_page_fault+0x160/0x5a1
  [c15c7586] ? __do_page_fault+0xf8/0x5a1
  [c15c7a2f] ? __do_page_fault+0x5a1/0x5a1
  [c15c7a3c] do_page_fault+0xd/0xf
  [c15c4e7c] error_code+0x6c/0x74
  [c114007b] ? memcg_update_all_caches+0x23/0x6b
  [c12d0be5] ? __copy_from_user_ll+0x30/0xdb
  [c12d0ccf] _copy_from_user+0x3f/0x55
  [c1057aa2] SyS_setrlimit+0x27/0x50
  [c1044792] ? SyS_gettimeofday+0x33/0x6d
  [c12d0798] ? trace_hardirqs_on_thunk+0xc/0x10
  [c15cb33b] sysenter_do_call+0x12/0x32
 Code: 6e 8d 8f 84 02 00 00 89 c8 89 4d e4 e8 df bf 3f 00 89 45 e8 89 da 89 f0 
 e8 99 2b fa ff 8b 43 08 3b 47 54 8b 4d e4 73 06 8b 57 50 89 34 82 8b 55 e8 
 89 c8 e8 aa c1 3f 00 8b 45 ec e8 28 c1 3f 00
 


Re: [PATCH 1/5] block: Make generic_make_request handle arbitrary sized bios

2013-11-25 Thread Kent Overstreet
On Mon, Nov 25, 2013 at 10:09:21PM -0800, Christoph Hellwig wrote:
  +   q-bio_split = bioset_create(4, 0);
  +   if (!q-bio_split)
  +   goto fail_id;
 
 How did we arrive at a mempool size of 4 to make sure we can always make
 progress with arbitrarily sized bios?  Shouldn't we document the design
 decision somewhere?

It just has to be nonzero to guarantee forward progress - the
bio_alloc_bioset() rescuer thing I did a while back guarantees that.

  +static struct bio *blk_bio_discard_split(struct request_queue *q,
  +struct bio *bio,
  +struct bio_set *bs)
  +{
 
 One fallout of this should be that it would allows us to impement
 a range UNMAP/TRIP much more easily, which would help dramatically
 with TRIM performance!

Cool!


Re: [PATCH 2/5] mtip32xx: handle arbitrary size bios

2013-11-25 Thread Kent Overstreet
On Mon, Nov 25, 2013 at 10:10:18PM -0800, Christoph Hellwig wrote:
 On Mon, Nov 25, 2013 at 02:30:30PM -0800, Kent Overstreet wrote:
  We get a measurable performance increase by handling this in the driver when
  we're already looping over the biovec, instead of handling it separately in
  generic_make_request() (or bio_add_page() originally)
 
 Given that Jens has a patch queue up to convert the driver to blk-mq
 adjusting the blk-mq infrastructure to handle this might be a better
 plan forward..

I mostly wanted the patch in the series as an example conversion -
converting blk-mq is going to take some thought and time to familiarize
myself with the code.


[PATCH] block: Silence spurious compiler warnings

2013-11-26 Thread Kent Overstreet
From 46e7081430f5f483906f496733a23f8e9d898879 Mon Sep 17 00:00:00 2001
From: Kent Overstreet k...@daterainc.com
Date: Tue, 26 Nov 2013 16:36:49 -0800
Subject: [PATCH] block: Silence spurious compiler warnings

Signed-off-by: Kent Overstreet k...@daterainc.com
---

On Tue, Nov 26, 2013 at 12:02:08PM -0700, Jens Axboe wrote:
 On 11/26/2013 12:01 PM, Olof Johansson wrote:
  I just noticed that I see this with gcc 4.7.0, but 4.8.1 does not warn.
 
 That's good, because it's not a bug. But arguably we should shut up 4.7
 as well, however...

Here you go:

 block/blk-merge.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 05c17be..0b097f6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -89,6 +89,8 @@ static int blk_phys_contig_segment(struct request_queue *q, 
struct bio *bio,
struct bio_vec end_bv, nxt_bv;
struct bvec_iter iter;
 
+   uninitialized_var(end_bv);
+
if (!blk_queue_cluster(q))
return 0;
 
@@ -173,6 +175,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request 
*rq,
struct scatterlist *sg;
int nsegs, cluster;
 
+   uninitialized_var(bvprv);
+
nsegs = 0;
cluster = blk_queue_cluster(q);
 
@@ -235,6 +239,8 @@ int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
int nsegs, cluster;
struct bvec_iter iter;
 
+   uninitialized_var(bvprv);
+
nsegs = 0;
cluster = blk_queue_cluster(q);
 
-- 
1.8.4.4



Re: [GIT PULL] Immutable biovecs

2013-11-26 Thread Kent Overstreet
On Mon, Nov 25, 2013 at 10:05:58PM -0800, Christoph Hellwig wrote:
 On Mon, Nov 25, 2013 at 01:52:16PM -0800, Kent Overstreet wrote:
  Jens - here's immutable biovecs, rebased and ready for 3.14. Changes since 
  the
  last version of the series:
 
 Can you do a resend of the patch series to all involved lists first so
 we can have a detailed look at the current version?

Sure thing.

   * bio_clone_bioset() retains the old behaviour, as previously discussed -
 bio_clone_fast() is being used by bcache, dm and the new bio_split().
 
 Any chance to have a more descriptive name than bio_clone_fast?  Also
 without having the actual patches in front of me, did you make sure to
 document the difference in semantics in detail?

I haven't been able to think of anything I don't hate; I'm open to suggestions.


[PATCH 05/25] dm: Use bvec_iter for dm_bio_record()

2013-11-26 Thread Kent Overstreet
This patch doesn't itself have any functional changes, but immutable
biovecs are going to add a bi_bvec_done member to bi_iter, which will
also need to be saved here.
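
For context, the iterator being saved here looks roughly like this (as
introduced earlier in the series):

struct bvec_iter {
	sector_t	bi_sector;	/* device address in 512 byte
					   sectors */
	unsigned int	bi_size;	/* residual I/O count */
	unsigned int	bi_idx;		/* current index into bi_io_vec */
	unsigned int	bi_bvec_done;	/* number of bytes completed in
					   current bvec */
};

so a single struct assignment captures all four fields at once,
including the new bi_bvec_done.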

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Alasdair Kergon a...@redhat.com
Cc: dm-de...@redhat.com
Reviewed-by: Mike Snitzer snit...@redhat.com
---
 drivers/md/dm-bio-record.h | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 5ace48e..4f46e8e 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -28,11 +28,9 @@ struct dm_bio_vec_details {
 };
 
 struct dm_bio_details {
-   sector_t bi_sector;
struct block_device *bi_bdev;
-   unsigned int bi_size;
-   unsigned short bi_idx;
unsigned long bi_flags;
+   struct bvec_iter bi_iter;
struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
 };
 
@@ -40,11 +38,9 @@ static inline void dm_bio_record(struct dm_bio_details *bd, 
struct bio *bio)
 {
unsigned i;
 
-   bd-bi_sector = bio-bi_iter.bi_sector;
bd-bi_bdev = bio-bi_bdev;
-   bd-bi_size = bio-bi_iter.bi_size;
-   bd-bi_idx = bio-bi_iter.bi_idx;
bd-bi_flags = bio-bi_flags;
+   bd-bi_iter = bio-bi_iter;
 
for (i = 0; i  bio-bi_vcnt; i++) {
bd-bi_io_vec[i].bv_len = bio-bi_io_vec[i].bv_len;
@@ -56,11 +52,9 @@ static inline void dm_bio_restore(struct dm_bio_details *bd, 
struct bio *bio)
 {
unsigned i;
 
-   bio-bi_iter.bi_sector = bd-bi_sector;
bio-bi_bdev = bd-bi_bdev;
-   bio-bi_iter.bi_size = bd-bi_size;
-   bio-bi_iter.bi_idx = bd-bi_idx;
bio-bi_flags = bd-bi_flags;
+   bio-bi_iter = bd-bi_iter;
 
for (i = 0; i  bio-bi_vcnt; i++) {
bio-bi_io_vec[i].bv_len = bd-bi_io_vec[i].bv_len;
-- 
1.8.4.4



[PATCH 06/25] block: Convert bio_iovec() to bvec_iter

2013-11-26 Thread Kent Overstreet
For immutable biovecs, we'll be introducing a new bio_iovec() that uses
our new bvec iterator to construct a biovec, taking into account
bvec_iter->bi_bvec_done - this patch updates existing users for the new
usage.

Some of the existing users really do need a pointer into the bvec array
- those uses are all going to be removed, but we'll need the
functionality from immutable biovecs to remove them - so for now rename
the existing bio_iovec() -> __bio_iovec(), and it'll be removed in a
couple of patches.
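
To make the conversion concrete: the new bio_iovec() builds a struct
bio_vec by value from the iterator - clamping the length against
bi_size and folding in bi_bvec_done - roughly like so (a sketch of the
series' definitions):

#define __bvec_iter_bvec(bvec, iter)	(&(bvec)[(iter).bi_idx])

#define bvec_iter_page(bvec, iter)				\
	(__bvec_iter_bvec((bvec), (iter))->bv_page)

#define bvec_iter_len(bvec, iter)				\
	min((iter).bi_size,					\
	    __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)

#define bvec_iter_offset(bvec, iter)				\
	(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)

#define bvec_iter_bvec(bvec, iter)				\
((struct bio_vec) {						\
	.bv_page	= bvec_iter_page((bvec), (iter)),	\
	.bv_len		= bvec_iter_len((bvec), (iter)),	\
	.bv_offset	= bvec_iter_offset((bvec), (iter)),	\
})

#define bio_iter_iovec(bio, iter)				\
	bvec_iter_bvec((bio)->bi_io_vec, (iter))

#define bio_iovec(bio)	bio_iter_iovec((bio), (bio)->bi_iter)

whereas __bio_iovec() keeps returning a bare pointer into the array for
the users that still need one.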

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Ed L. Cashin ecas...@coraid.com
Cc: Alasdair Kergon a...@redhat.com
Cc: dm-de...@redhat.com
Cc: James E.J. Bottomley jbottom...@parallels.com
---
 drivers/block/aoe/aoecmd.c |  2 +-
 drivers/md/bcache/io.c | 13 +++--
 drivers/md/dm-verity.c |  2 +-
 drivers/scsi/sd.c  |  2 +-
 fs/bio.c   | 20 ++--
 include/linux/bio.h| 10 ++
 6 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 877ba11..77c24ab 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -932,7 +932,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio 
*bio)
buf-resid = bio-bi_iter.bi_size;
buf-sector = bio-bi_iter.bi_sector;
bio_pageinc(bio);
-   buf-bv = bio_iovec(bio);
+   buf-bv = __bio_iovec(bio);
buf-bv_resid = buf-bv-bv_len;
WARN_ON(buf-bv_resid == 0);
 }
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index cc4ba2d..dc44f06 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -22,11 +22,12 @@ static void bch_bi_idx_hack_endio(struct bio *bio, int 
error)
 static void bch_generic_make_request_hack(struct bio *bio)
 {
if (bio-bi_iter.bi_idx) {
+   int i;
+   struct bio_vec *bv;
struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio));
 
-   memcpy(clone-bi_io_vec,
-  bio_iovec(bio),
-  bio_segments(bio) * sizeof(struct bio_vec));
+   bio_for_each_segment(bv, bio, i)
+   clone-bi_io_vec[clone-bi_vcnt++] = *bv;
 
clone-bi_iter.bi_sector = bio-bi_iter.bi_sector;
clone-bi_bdev  = bio-bi_bdev;
@@ -97,7 +98,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
if (!ret)
return NULL;
 
-   memcpy(ret-bi_io_vec, bio_iovec(bio),
+   memcpy(ret-bi_io_vec, __bio_iovec(bio),
   sizeof(struct bio_vec) * vcnt);
 
break;
@@ -106,7 +107,7 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
if (!ret)
return NULL;
 
-   memcpy(ret-bi_io_vec, bio_iovec(bio),
+   memcpy(ret-bi_io_vec, __bio_iovec(bio),
   sizeof(struct bio_vec) * vcnt);
 
ret-bi_io_vec[vcnt - 1].bv_len = nbytes;
@@ -182,7 +183,7 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
ret = min(ret, queue_max_sectors(q));
 
WARN_ON(!ret);
-   ret = max_t(int, ret, bio_iovec(bio)-bv_len  9);
+   ret = max_t(int, ret, bio_iovec(bio).bv_len  9);
 
return ret;
 }
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 132b315..5392135 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -524,7 +524,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
io-io_vec = io-io_vec_inline;
else
io-io_vec = mempool_alloc(v-vec_mempool, GFP_NOIO);
-   memcpy(io-io_vec, bio_iovec(bio),
+   memcpy(io-io_vec, __bio_iovec(bio),
   io-io_vec_size * sizeof(struct bio_vec));
 
verity_submit_prefetch(v, io);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index e6c4bff..200d6bc 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -801,7 +801,7 @@ static int sd_setup_write_same_cmnd(struct scsi_device 
*sdp, struct request *rq)
if (sdkp-device-no_write_same)
return BLKPREP_KILL;
 
-   BUG_ON(bio_offset(bio) || bio_iovec(bio)-bv_len != sdp-sector_size);
+   BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp-sector_size);
 
sector = ilog2(sdp-sector_size) - 9;
nr_sectors = ilog2(sdp-sector_size) - 9;
diff --git a/fs/bio.c b/fs/bio.c
index a402ad6..7bb281f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -821,12 +821,12 @@ void bio_advance(struct bio *bio, unsigned bytes)
break;
}
 
-   if (bytes = bio_iovec(bio)-bv_len) {
-   bytes -= bio_iovec(bio)-bv_len;
+   if (bytes = bio_iovec(bio).bv_len) {
+   bytes -= bio_iovec(bio).bv_len

[PATCH 01/25] block: submit_bio_wait() conversions

2013-11-26 Thread Kent Overstreet
It was being open coded in a few places.
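
The helper itself is just the usual completion-on-stack pattern; a
sketch of what the open-coded versions collapse into:

struct submit_bio_ret {
	struct completion event;
	int error;
};

static void submit_bio_wait_endio(struct bio *bio, int error)
{
	struct submit_bio_ret *ret = bio->bi_private;

	ret->error = error;
	complete(&ret->event);
}

int submit_bio_wait(int rw, struct bio *bio)
{
	struct submit_bio_ret ret;

	rw |= REQ_SYNC;
	init_completion(&ret.event);
	bio->bi_private = &ret;
	bio->bi_end_io = submit_bio_wait_endio;
	submit_bio(rw, bio);
	wait_for_completion(&ret.event);

	return ret.error;
}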

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Joern Engel jo...@logfs.org
Cc: Prasad Joshi prasadjoshi.li...@gmail.com
Cc: Neil Brown ne...@suse.de
Cc: Chris Mason chris.ma...@fusionio.com
Acked-by: NeilBrown ne...@suse.de
---
 block/blk-flush.c  | 19 +--
 drivers/md/md.c| 14 +-
 fs/btrfs/check-integrity.c | 32 +---
 fs/btrfs/check-integrity.h |  2 ++
 fs/btrfs/extent_io.c   | 12 +---
 fs/btrfs/scrub.c   | 33 -
 fs/hfsplus/wrapper.c   | 17 +
 fs/logfs/dev_bdev.c| 13 +
 8 files changed, 24 insertions(+), 118 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 331e627..fb6f3c0 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -502,15 +502,6 @@ void blk_abort_flushes(struct request_queue *q)
}
 }
 
-static void bio_end_flush(struct bio *bio, int err)
-{
-   if (err)
-   clear_bit(BIO_UPTODATE, bio-bi_flags);
-   if (bio-bi_private)
-   complete(bio-bi_private);
-   bio_put(bio);
-}
-
 /**
  * blkdev_issue_flush - queue a flush
  * @bdev:  blockdev to issue flush for
@@ -526,7 +517,6 @@ static void bio_end_flush(struct bio *bio, int err)
 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
 {
-   DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
int ret = 0;
@@ -548,13 +538,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t 
gfp_mask,
return -ENXIO;
 
bio = bio_alloc(gfp_mask, 0);
-   bio-bi_end_io = bio_end_flush;
bio-bi_bdev = bdev;
-   bio-bi_private = wait;
 
-   bio_get(bio);
-   submit_bio(WRITE_FLUSH, bio);
-   wait_for_completion_io(wait);
+   ret = submit_bio_wait(WRITE_FLUSH, bio);
 
/*
 * The driver must store the error location in -bi_sector, if
@@ -564,9 +550,6 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t 
gfp_mask,
if (error_sector)
*error_sector = bio-bi_sector;
 
-   if (!bio_flagged(bio, BIO_UPTODATE))
-   ret = -EIO;
-
bio_put(bio);
return ret;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b6b7a28..739b1ec 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -776,20 +776,12 @@ void md_super_wait(struct mddev *mddev)
finish_wait(mddev-sb_wait, wq);
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-   complete((struct completion*)bio-bi_private);
-}
-
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 struct page *page, int rw, bool metadata_op)
 {
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev-mddev);
-   struct completion event;
int ret;
 
-   rw |= REQ_SYNC;
-
bio-bi_bdev = (metadata_op  rdev-meta_bdev) ?
rdev-meta_bdev : rdev-bdev;
if (metadata_op)
@@ -801,11 +793,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, 
int size,
else
bio-bi_sector = sector + rdev-data_offset;
bio_add_page(bio, page, size, 0);
-   init_completion(event);
-   bio-bi_private = event;
-   bio-bi_end_io = bi_complete;
-   submit_bio(rw, bio);
-   wait_for_completion(event);
+   submit_bio_wait(rw, bio);
 
ret = test_bit(BIO_UPTODATE, bio-bi_flags);
bio_put(bio);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b50764b..131d828 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct 
btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
  struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
for (i = 0; i  num_pages;) {
struct bio *bio;
unsigned int j;
-   DECLARE_COMPLETION_ONSTACK(complete);
 
bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
if (!bio) {
@@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
}
bio-bi_bdev = block_ctx-dev-bdev;
bio-bi_sector = dev_bytenr  9;
-   bio-bi_end_io = btrfsic_complete_bio_end_io;
-   bio-bi_private = complete

[PATCH 03/25] bcache: Kill unaligned bvec hack

2013-11-26 Thread Kent Overstreet
Bcache has a hack to avoid cloning the biovec if it's all full pages -
but with immutable biovecs coming this won't be necessary anymore.

For now, we remove the special case and always clone the bvec array so
that the immutable biovec patches are simpler.

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/md/bcache/bcache.h  |  1 -
 drivers/md/bcache/request.c | 37 +++--
 drivers/md/bcache/super.c   |  4 
 3 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4beb55a..6b6fe93 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -279,7 +279,6 @@ struct bcache_device {
unsigned long   sectors_dirty_last;
longsectors_dirty_derivative;
 
-   mempool_t   *unaligned_bvec;
struct bio_set  *bio_split;
 
unsigneddata_csum:1;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index fbcc851..78bab41 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -606,7 +606,6 @@ struct search {
unsignedinsert_bio_sectors;
 
unsignedrecoverable:1;
-   unsignedunaligned_bvec:1;
unsignedwrite:1;
unsignedread_dirty_data:1;
 
@@ -614,6 +613,7 @@ struct search {
 
struct btree_op op;
struct data_insert_op   iop;
+   struct bio_vec  bv[BIO_MAX_PAGES];
 };
 
 static void bch_cache_read_endio(struct bio *bio, int error)
@@ -759,10 +759,14 @@ static void bio_complete(struct search *s)
 static void do_bio_hook(struct search *s)
 {
struct bio *bio = s-bio.bio;
-   memcpy(bio, s-orig_bio, sizeof(struct bio));
 
+   bio_init(bio);
+   bio-bi_io_vec  = s-bv;
+   bio-bi_max_vecs= BIO_MAX_PAGES;
+   __bio_clone(bio, s-orig_bio);
bio-bi_end_io  = request_endio;
bio-bi_private = s-cl;
+
atomic_set(bio-bi_cnt, 3);
 }
 
@@ -774,9 +778,6 @@ static void search_free(struct closure *cl)
if (s-iop.bio)
bio_put(s-iop.bio);
 
-   if (s-unaligned_bvec)
-   mempool_free(s-bio.bio.bi_io_vec, s-d-unaligned_bvec);
-
closure_debug_destroy(cl);
mempool_free(s, s-d-c-search);
 }
@@ -784,7 +785,6 @@ static void search_free(struct closure *cl)
 static struct search *search_alloc(struct bio *bio, struct bcache_device *d)
 {
struct search *s;
-   struct bio_vec *bv;
 
s = mempool_alloc(d-c-search, GFP_NOIO);
memset(s, 0, offsetof(struct search, iop.insert_keys));
@@ -803,15 +803,6 @@ static struct search *search_alloc(struct bio *bio, struct 
bcache_device *d)
s-start_time   = jiffies;
do_bio_hook(s);
 
-   if (bio-bi_size != bio_segments(bio) * PAGE_SIZE) {
-   bv = mempool_alloc(d-unaligned_bvec, GFP_NOIO);
-   memcpy(bv, bio_iovec(bio),
-  sizeof(struct bio_vec) * bio_segments(bio));
-
-   s-bio.bio.bi_io_vec= bv;
-   s-unaligned_bvec   = 1;
-   }
-
return s;
 }
 
@@ -850,26 +841,13 @@ static void cached_dev_read_error(struct closure *cl)
 {
struct search *s = container_of(cl, struct search, cl);
struct bio *bio = s-bio.bio;
-   struct bio_vec *bv;
-   int i;
 
if (s-recoverable) {
/* Retry from the backing device: */
trace_bcache_read_retry(s-orig_bio);
 
s-iop.error = 0;
-   bv = s-bio.bio.bi_io_vec;
do_bio_hook(s);
-   s-bio.bio.bi_io_vec = bv;
-
-   if (!s-unaligned_bvec)
-   bio_for_each_segment(bv, s-orig_bio, i)
-   bv-bv_offset = 0, bv-bv_len = PAGE_SIZE;
-   else
-   memcpy(s-bio.bio.bi_io_vec,
-  bio_iovec(s-orig_bio),
-  sizeof(struct bio_vec) *
-  bio_segments(s-orig_bio));
 
/* XXX: invalidate cache */
 
@@ -905,8 +883,7 @@ static void cached_dev_read_done(struct closure *cl)
s-cache_miss = NULL;
}
 
-   if (verify(dc, s-bio.bio)  s-recoverable 
-   !s-unaligned_bvec  !s-read_dirty_data)
+   if (verify(dc, s-bio.bio)  s-recoverable  !s-read_dirty_data)
bch_data_verify(dc, s-orig_bio);
 
bio_complete(s);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index dec15cd..1d9ee67 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -739,8 +739,6 @@ static void bcache_device_free(struct bcache_device *d)
}
 
bio_split_pool_free(d-bio_split_hook);
-   if (d-unaligned_bvec)
-   mempool_destroy(d

[PATCH 18/25] rbd: Refactor bio cloning

2013-11-26 Thread Kent Overstreet
Now that we've got drivers converted to the new immutable bvec
primitives, bio splitting becomes much easier - this is how the new
bio_split() will work. (Someone more familiar with the ceph code could
probably use bio_clone_fast() instead of bio_clone() here).
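
The refactor boils down to clone, advance, trim; stripped of the rbd
specifics, the pattern is (a sketch - clone_range() is a made-up name,
and error handling is elided):

static struct bio *clone_range(struct bio *bio_src, unsigned offset,
			       unsigned len, gfp_t gfpmask)
{
	struct bio *bio = bio_clone(bio_src, gfpmask);

	if (!bio)
		return NULL;

	bio_advance(bio, offset);	/* skip the first 'offset' bytes */
	bio->bi_iter.bi_size = len;	/* drop everything past 'len' */

	return bio;
}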

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Yehuda Sadeh yeh...@inktank.com
Cc: Alex Elder el...@inktank.com
Cc: ceph-de...@vger.kernel.org
---
 drivers/block/rbd.c | 64 ++---
 1 file changed, 2 insertions(+), 62 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 20e8ab3..3624368 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1173,73 +1173,13 @@ static struct bio *bio_clone_range(struct bio *bio_src,
unsigned int len,
gfp_t gfpmask)
 {
-   struct bio_vec bv;
-   struct bvec_iter iter;
-   struct bvec_iter end_iter;
-   unsigned int resid;
-   unsigned int voff;
-   unsigned short vcnt;
struct bio *bio;
 
-   /* Handle the easy case for the caller */
-
-   if (!offset  len == bio_src-bi_iter.bi_size)
-   return bio_clone(bio_src, gfpmask);
-
-   if (WARN_ON_ONCE(!len))
-   return NULL;
-   if (WARN_ON_ONCE(len  bio_src-bi_iter.bi_size))
-   return NULL;
-   if (WARN_ON_ONCE(offset  bio_src-bi_iter.bi_size - len))
-   return NULL;
-
-   /* Find first affected segment... */
-
-   resid = offset;
-   bio_for_each_segment(bv, bio_src, iter) {
-   if (resid  bv.bv_len)
-   break;
-   resid -= bv.bv_len;
-   }
-   voff = resid;
-
-   /* ...and the last affected segment */
-
-   resid += len;
-   __bio_for_each_segment(bv, bio_src, end_iter, iter) {
-   if (resid = bv.bv_len)
-   break;
-   resid -= bv.bv_len;
-   }
-   vcnt = end_iter.bi_idx = iter.bi_idx + 1;
-
-   /* Build the clone */
-
-   bio = bio_alloc(gfpmask, (unsigned int) vcnt);
+   bio = bio_clone(bio_src, gfpmask);
if (!bio)
return NULL;/* ENOMEM */
 
-   bio-bi_bdev = bio_src-bi_bdev;
-   bio-bi_iter.bi_sector = bio_src-bi_iter.bi_sector +
-   (offset  SECTOR_SHIFT);
-   bio-bi_rw = bio_src-bi_rw;
-   bio-bi_flags |= 1  BIO_CLONED;
-
-   /*
-* Copy over our part of the bio_vec, then update the first
-* and last (or only) entries.
-*/
-   memcpy(bio-bi_io_vec[0], bio_src-bi_io_vec[iter.bi_idx],
-   vcnt * sizeof (struct bio_vec));
-   bio-bi_io_vec[0].bv_offset += voff;
-   if (vcnt  1) {
-   bio-bi_io_vec[0].bv_len -= voff;
-   bio-bi_io_vec[vcnt - 1].bv_len = resid;
-   } else {
-   bio-bi_io_vec[0].bv_len = len;
-   }
-
-   bio-bi_vcnt = vcnt;
+   bio_advance(bio, offset);
bio-bi_iter.bi_size = len;
 
return bio;
-- 
1.8.4.4



[PATCH 20/25] block: Don't save/copy bvec array anymore

2013-11-26 Thread Kent Overstreet
Now that drivers have been converted to the bvec_iter primitives, they
shouldn't be modifying the biovec anymore, so saving it is unnecessary -
code that was previously making a backup of the bvec array can now just
save bio->bi_iter.
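
In other words, the backup shrinks from a full array copy to a struct
assignment; schematically (a sketch, names hypothetical):

/* before: the biovec could be mutated in flight, so copy it all */
memcpy(saved_bvecs, bio->bi_io_vec,
       bio->bi_vcnt * sizeof(struct bio_vec));

/* after: the biovec is immutable; the iterator is the only state */
struct bvec_iter saved_iter = bio->bi_iter;
/* ... submit and complete ... */
bio->bi_iter = saved_iter;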

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 fs/bio.c | 54 +-
 1 file changed, 13 insertions(+), 41 deletions(-)

diff --git a/fs/bio.c b/fs/bio.c
index 6e42b68..9cff939 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -967,60 +967,33 @@ void bio_copy_data(struct bio *dst, struct bio *src)
 EXPORT_SYMBOL(bio_copy_data);
 
 struct bio_map_data {
-   struct bio_vec *iovecs;
-   struct sg_iovec *sgvecs;
int nr_sgvecs;
int is_our_pages;
+   struct sg_iovec sgvecs[];
 };
 
 static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
 struct sg_iovec *iov, int iov_count,
 int is_our_pages)
 {
-   memcpy(bmd-iovecs, bio-bi_io_vec, sizeof(struct bio_vec) * 
bio-bi_vcnt);
memcpy(bmd-sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
bmd-nr_sgvecs = iov_count;
bmd-is_our_pages = is_our_pages;
bio-bi_private = bmd;
 }
 
-static void bio_free_map_data(struct bio_map_data *bmd)
-{
-   kfree(bmd-iovecs);
-   kfree(bmd-sgvecs);
-   kfree(bmd);
-}
-
 static struct bio_map_data *bio_alloc_map_data(int nr_segs,
   unsigned int iov_count,
   gfp_t gfp_mask)
 {
-   struct bio_map_data *bmd;
-
if (iov_count  UIO_MAXIOV)
return NULL;
 
-   bmd = kmalloc(sizeof(*bmd), gfp_mask);
-   if (!bmd)
-   return NULL;
-
-   bmd-iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
-   if (!bmd-iovecs) {
-   kfree(bmd);
-   return NULL;
-   }
-
-   bmd-sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
-   if (bmd-sgvecs)
-   return bmd;
-
-   kfree(bmd-iovecs);
-   kfree(bmd);
-   return NULL;
+   return kmalloc(sizeof(struct bio_map_data) +
+  sizeof(struct sg_iovec) * iov_count, gfp_mask);
 }
 
-static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
- struct sg_iovec *iov, int iov_count,
+static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
  int to_user, int from_user, int do_free_page)
 {
int ret = 0, i;
@@ -1030,7 +1003,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec 
*iovecs,
 
bio_for_each_segment_all(bvec, bio, i) {
char *bv_addr = page_address(bvec-bv_page);
-   unsigned int bv_len = iovecs[i].bv_len;
+   unsigned int bv_len = bvec-bv_len;
 
while (bv_len  iov_idx  iov_count) {
unsigned int bytes;
@@ -1090,14 +1063,14 @@ int bio_uncopy_user(struct bio *bio)
 * don't copy into a random user address space, just free.
 */
if (current-mm)
-   ret = __bio_copy_iov(bio, bmd-iovecs, bmd-sgvecs,
-bmd-nr_sgvecs, bio_data_dir(bio) 
== READ,
+   ret = __bio_copy_iov(bio, bmd-sgvecs, bmd-nr_sgvecs,
+bio_data_dir(bio) == READ,
 0, bmd-is_our_pages);
else if (bmd-is_our_pages)
bio_for_each_segment_all(bvec, bio, i)
__free_page(bvec-bv_page);
}
-   bio_free_map_data(bmd);
+   kfree(bmd);
bio_put(bio);
return ret;
 }
@@ -1211,7 +1184,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 */
if ((!write_to_vm  (!map_data || !map_data-null_mapped)) ||
(map_data  map_data-from_user)) {
-   ret = __bio_copy_iov(bio, bio-bi_io_vec, iov, iov_count, 0, 1, 
0);
+   ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
if (ret)
goto cleanup;
}
@@ -1225,7 +1198,7 @@ cleanup:
 
bio_put(bio);
 out_bmd:
-   bio_free_map_data(bmd);
+   kfree(bmd);
return ERR_PTR(ret);
 }
 
@@ -1542,16 +1515,15 @@ static void bio_copy_kern_endio(struct bio *bio, int 
err)
 
bio_for_each_segment_all(bvec, bio, i) {
char *addr = page_address(bvec-bv_page);
-   int len = bmd-iovecs[i].bv_len;
 
if (read)
-   memcpy(p, addr, len);
+   memcpy(p, addr, bvec-bv_len);
 
__free_page(bvec-bv_page);
-   p += len;
+   p += bvec-bv_len;
}
 
-   bio_free_map_data(bmd);
+   kfree(bmd);
bio_put

[PATCH 24/25] block: Introduce new bio_split()

2013-11-26 Thread Kent Overstreet
The new bio_split() can split arbitrary bios - it's not restricted to
single page bios, like the old bio_split() (previously renamed to
bio_pair_split()). It also has different semantics - it doesn't allocate
a struct bio_pair, leaving it up to the caller to handle completions.

Then convert the existing bio_pair_split() users to the new bio_split()
- and also nvme, which was open coding bio splitting.

(We have to take that BUG_ON() out of bio_integrity_trim() because this
bio_split() needs to use it, and there's no reason it has to be used on
bios marked as cloned; BIO_CLONED doesn't seem to have clearly
documented semantics anyway.)
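
Callers now split, chain, and resubmit; a sketch of the usage pattern
the conversions below follow (with fs_bio_set as a stand-in bio_set):

struct bio *split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);

if (!split)
	return -ENOMEM;

bio_chain(split, bio);		/* parent completes after the child */
generic_make_request(split);	/* submit the front 'sectors' */
generic_make_request(bio);	/* 'bio' has been advanced past them */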

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Martin K. Petersen martin.peter...@oracle.com
Cc: Matthew Wilcox matthew.r.wil...@intel.com
Cc: Keith Busch keith.bu...@intel.com
Cc: Vishal Verma vishal.l.ve...@intel.com
Cc: Jiri Kosina jkos...@suse.cz
Cc: Neil Brown ne...@suse.de
---
 drivers/block/nvme-core.c   | 106 +++---
 drivers/block/pktcdvd.c | 136 
 drivers/md/bcache/bcache.h  |   1 -
 drivers/md/bcache/io.c  |  82 +-
 drivers/md/bcache/request.c |  12 ++--
 drivers/md/linear.c |  96 +++
 drivers/md/raid0.c  |  77 +
 drivers/md/raid10.c | 113 +++-
 fs/bio.c|  36 
 include/linux/bio.h |  22 +++
 10 files changed, 272 insertions(+), 409 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 5539d29..1f14ac4 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -441,104 +441,19 @@ int nvme_setup_prps(struct nvme_dev *dev, struct 
nvme_common_command *cmd,
return total_len;
 }
 
-struct nvme_bio_pair {
-   struct bio b1, b2, *parent;
-   struct bio_vec *bv1, *bv2;
-   int err;
-   atomic_t cnt;
-};
-
-static void nvme_bio_pair_endio(struct bio *bio, int err)
-{
-   struct nvme_bio_pair *bp = bio-bi_private;
-
-   if (err)
-   bp-err = err;
-
-   if (atomic_dec_and_test(bp-cnt)) {
-   bio_endio(bp-parent, bp-err);
-   kfree(bp-bv1);
-   kfree(bp-bv2);
-   kfree(bp);
-   }
-}
-
-static struct nvme_bio_pair *nvme_bio_split(struct bio *bio, int idx,
-   int len, int offset)
-{
-   struct nvme_bio_pair *bp;
-
-   BUG_ON(len  bio-bi_iter.bi_size);
-   BUG_ON(idx  bio-bi_vcnt);
-
-   bp = kmalloc(sizeof(*bp), GFP_ATOMIC);
-   if (!bp)
-   return NULL;
-   bp-err = 0;
-
-   bp-b1 = *bio;
-   bp-b2 = *bio;
-
-   bp-b1.bi_iter.bi_size = len;
-   bp-b2.bi_iter.bi_size -= len;
-   bp-b1.bi_vcnt = idx;
-   bp-b2.bi_iter.bi_idx = idx;
-   bp-b2.bi_iter.bi_sector += len  9;
-
-   if (offset) {
-   bp-bv1 = kmalloc(bio-bi_max_vecs * sizeof(struct bio_vec),
-   GFP_ATOMIC);
-   if (!bp-bv1)
-   goto split_fail_1;
-
-   bp-bv2 = kmalloc(bio-bi_max_vecs * sizeof(struct bio_vec),
-   GFP_ATOMIC);
-   if (!bp-bv2)
-   goto split_fail_2;
-
-   memcpy(bp-bv1, bio-bi_io_vec,
-   bio-bi_max_vecs * sizeof(struct bio_vec));
-   memcpy(bp-bv2, bio-bi_io_vec,
-   bio-bi_max_vecs * sizeof(struct bio_vec));
-
-   bp-b1.bi_io_vec = bp-bv1;
-   bp-b2.bi_io_vec = bp-bv2;
-   bp-b2.bi_io_vec[idx].bv_offset += offset;
-   bp-b2.bi_io_vec[idx].bv_len -= offset;
-   bp-b1.bi_io_vec[idx].bv_len = offset;
-   bp-b1.bi_vcnt++;
-   } else
-   bp-bv1 = bp-bv2 = NULL;
-
-   bp-b1.bi_private = bp;
-   bp-b2.bi_private = bp;
-
-   bp-b1.bi_end_io = nvme_bio_pair_endio;
-   bp-b2.bi_end_io = nvme_bio_pair_endio;
-
-   bp-parent = bio;
-   atomic_set(bp-cnt, 2);
-
-   return bp;
-
- split_fail_2:
-   kfree(bp-bv1);
- split_fail_1:
-   kfree(bp);
-   return NULL;
-}
-
 static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
-   int idx, int len, int offset)
+int len)
 {
-   struct nvme_bio_pair *bp = nvme_bio_split(bio, idx, len, offset);
-   if (!bp)
+   struct bio *split = bio_split(bio, len  9, GFP_ATOMIC, NULL);
+   if (!split)
return -ENOMEM;
 
+   bio_chain(split, bio);
+
if (bio_list_empty(nvmeq-sq_cong))
add_wait_queue(nvmeq-sq_full, nvmeq-sq_cong_wait);
-   bio_list_add(nvmeq-sq_cong, bp-b1

[PATCH 19/25] dm: Refactor for new bio cloning/splitting

2013-11-26 Thread Kent Overstreet
We need to convert the dm code to the new bvec_iter primitives which
respect bi_bvec_done; they also allow us to drastically simplify dm's
bio splitting code.

It's also no longer necessary to save/restore the bvec array - driver
conversions for immutable bvecs are done, so drivers should never be
modifying it.

Also kill bio_sector_offset(); dm was the only user, and it doesn't make
much sense anymore.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Alasdair Kergon a...@redhat.com
Cc: dm-de...@redhat.com
Reviewed-by: Mike Snitzer snit...@redhat.com
---
 drivers/md/dm-bio-record.h |  25 ---
 drivers/md/dm.c| 174 ++---
 fs/bio.c   |  72 ---
 include/linux/bio.h|   2 -
 4 files changed, 20 insertions(+), 253 deletions(-)

diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
index 4f46e8e..dd36461 100644
--- a/drivers/md/dm-bio-record.h
+++ b/drivers/md/dm-bio-record.h
@@ -17,49 +17,24 @@
  * original bio state.
  */
 
-struct dm_bio_vec_details {
-#if PAGE_SIZE  65536
-   __u16 bv_len;
-   __u16 bv_offset;
-#else
-   unsigned bv_len;
-   unsigned bv_offset;
-#endif
-};
-
 struct dm_bio_details {
struct block_device *bi_bdev;
unsigned long bi_flags;
struct bvec_iter bi_iter;
-   struct dm_bio_vec_details bi_io_vec[BIO_MAX_PAGES];
 };
 
 static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
 {
-   unsigned i;
-
bd-bi_bdev = bio-bi_bdev;
bd-bi_flags = bio-bi_flags;
bd-bi_iter = bio-bi_iter;
-
-   for (i = 0; i  bio-bi_vcnt; i++) {
-   bd-bi_io_vec[i].bv_len = bio-bi_io_vec[i].bv_len;
-   bd-bi_io_vec[i].bv_offset = bio-bi_io_vec[i].bv_offset;
-   }
 }
 
 static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
 {
-   unsigned i;
-
bio-bi_bdev = bd-bi_bdev;
bio-bi_flags = bd-bi_flags;
bio-bi_iter = bd-bi_iter;
-
-   for (i = 0; i  bio-bi_vcnt; i++) {
-   bio-bi_io_vec[i].bv_len = bd-bi_io_vec[i].bv_len;
-   bio-bi_io_vec[i].bv_offset = bd-bi_io_vec[i].bv_offset;
-   }
 }
 
 #endif
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ccd064e..44a2fa6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1155,7 +1155,6 @@ struct clone_info {
struct dm_io *io;
sector_t sector;
sector_t sector_count;
-   unsigned short idx;
 };
 
 static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
@@ -1164,68 +1163,24 @@ static void bio_setup_sector(struct bio *bio, sector_t 
sector, sector_t len)
bio-bi_iter.bi_size = to_bytes(len);
 }
 
-static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short 
bv_count)
-{
-   bio-bi_iter.bi_idx = idx;
-   bio-bi_vcnt = idx + bv_count;
-   bio-bi_flags = ~(1  BIO_SEG_VALID);
-}
-
-static void clone_bio_integrity(struct bio *bio, struct bio *clone,
-   unsigned short idx, unsigned len, unsigned 
offset,
-   unsigned trim)
-{
-   if (!bio_integrity(bio))
-   return;
-
-   bio_integrity_clone(clone, bio, GFP_NOIO);
-
-   if (trim)
-   bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), 
len);
-}
-
-/*
- * Creates a little bio that just does part of a bvec.
- */
-static void clone_split_bio(struct dm_target_io *tio, struct bio *bio,
-   sector_t sector, unsigned short idx,
-   unsigned offset, unsigned len)
-{
-   struct bio *clone = tio-clone;
-   struct bio_vec *bv = bio-bi_io_vec + idx;
-
-   *clone-bi_io_vec = *bv;
-
-   bio_setup_sector(clone, sector, len);
-
-   clone-bi_bdev = bio-bi_bdev;
-   clone-bi_rw = bio-bi_rw;
-   clone-bi_vcnt = 1;
-   clone-bi_io_vec-bv_offset = offset;
-   clone-bi_io_vec-bv_len = clone-bi_iter.bi_size;
-   clone-bi_flags |= 1  BIO_CLONED;
-
-   clone_bio_integrity(bio, clone, idx, len, offset, 1);
-}
-
 /*
  * Creates a bio that consists of range of complete bvecs.
  */
 static void clone_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned short idx,
- unsigned short bv_count, unsigned len)
+ sector_t sector, unsigned len)
 {
struct bio *clone = tio-clone;
-   unsigned trim = 0;
 
-   __bio_clone(clone, bio);
-   bio_setup_sector(clone, sector, len);
-   bio_setup_bv(clone, idx, bv_count);
+   __bio_clone_fast(clone, bio);
+
+   if (bio_integrity(bio))
+   bio_integrity_clone(clone, bio, GFP_NOIO);
+
+   bio_advance(clone, to_bytes(sector - clone-bi_iter.bi_sector));
+   clone-bi_iter.bi_size = to_bytes(len);
 
-   if (idx != bio-bi_iter.bi_idx ||
-   clone-bi_iter.bi_size

[PATCH 22/25] block: Generic bio chaining

2013-11-26 Thread Kent Overstreet
This adds a generic mechanism for chaining bio completions. This is
going to be used for a bio_split() replacement, and it turns out to be
very useful in a fair amount of driver code - a fair number of drivers
were implementing this in their own roundabout ways, often painfully.

Note that this means it's no longer possible to call bio_endio() more
than once on the same bio! This can cause problems for drivers that
save/restore bi_end_io. Arguably they shouldn't be saving/restoring
bi_end_io at all - in all but the simplest cases they'd be better off
just cloning the bio, and immutable biovecs are making bio cloning
cheaper. But for now, we add a bio_endio_nodec() for these cases.
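
The mechanism itself is tiny; a sketch of how a chained child completes
its parent:

static void bio_chain_endio(struct bio *bio, int error)
{
	bio_endio(bio->bi_private, error);	/* drops parent's bi_remaining */
	bio_put(bio);
}

void bio_chain(struct bio *bio, struct bio *parent)
{
	BUG_ON(bio->bi_private || bio->bi_end_io);

	bio->bi_private = parent;
	bio->bi_end_io	= bio_chain_endio;
	atomic_inc(&parent->bi_remaining);
}

bio_endio() then only completes a bio once bi_remaining drops to zero,
which is what makes calling it twice on the same bio an error.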

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 drivers/md/bcache/io.c   |  2 +-
 drivers/md/dm-cache-target.c |  6 
 drivers/md/dm-snap.c |  1 +
 drivers/md/dm-thin.c |  8 +++--
 drivers/md/dm-verity.c   |  2 +-
 fs/bio-integrity.c   |  2 +-
 fs/bio.c | 76 
 include/linux/bio.h  |  2 ++
 include/linux/blk_types.h|  2 ++
 9 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 0f0ab65..522f957 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -133,7 +133,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
s-bio-bi_end_io = s-bi_end_io;
s-bio-bi_private = s-bi_private;
-   bio_endio(s-bio, 0);
+   bio_endio_nodec(s-bio, 0);
 
closure_debug_destroy(s-cl);
mempool_free(s, s-p-bio_split_hook);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 86f9c83..bf3a206 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -765,6 +765,12 @@ static void writethrough_endio(struct bio *bio, int err)
 
dm_unhook_bio(pb-hook_info, bio);
 
+   /*
+* Must bump bi_remaining to allow bio to complete with
+* restored bi_end_io.
+*/
+   atomic_inc(bio-bi_remaining);
+
if (err) {
bio_endio(bio, err);
return;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 3ded8c7..80b5cab 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1415,6 +1415,7 @@ out:
if (full_bio) {
full_bio-bi_end_io = pe-full_bio_end_io;
full_bio-bi_private = pe-full_bio_private;
+   atomic_inc(full_bio-bi_remaining);
}
free_pending_exception(pe);
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index a654024..1abb4a2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -611,8 +611,10 @@ static void cell_defer_no_holder(struct thin_c *tc, struct 
dm_bio_prison_cell *c
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-   if (m-bio)
+   if (m-bio) {
m-bio-bi_end_io = m-saved_bi_end_io;
+   atomic_inc(m-bio-bi_remaining);
+   }
cell_error(m-tc-pool, m-cell);
list_del(m-list);
mempool_free(m, m-tc-pool-mapping_pool);
@@ -626,8 +628,10 @@ static void process_prepared_mapping(struct 
dm_thin_new_mapping *m)
int r;
 
bio = m-bio;
-   if (bio)
+   if (bio) {
bio-bi_end_io = m-saved_bi_end_io;
+   atomic_inc(bio-bi_remaining);
+   }
 
if (m-err) {
cell_error(pool, m-cell);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index ac35e95..796007a 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -385,7 +385,7 @@ static void verity_finish_io(struct dm_verity_io *io, int 
error)
bio-bi_end_io = io-orig_bi_end_io;
bio-bi_private = io-orig_bi_private;
 
-   bio_endio(bio, error);
+   bio_endio_nodec(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index fed744b..9d547d2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -502,7 +502,7 @@ static void bio_integrity_verify_fn(struct work_struct 
*work)
 
/* Restore original bio completion handler */
bio-bi_end_io = bip-bip_end_io;
-   bio_endio(bio, error);
+   bio_endio_nodec(bio, error);
 }
 
 /**
diff --git a/fs/bio.c b/fs/bio.c
index e6dfa06..b0a16db 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -273,6 +273,7 @@ void bio_init(struct bio *bio)
 {
memset(bio, 0, sizeof(*bio));
bio-bi_flags = 1  BIO_UPTODATE;
+   atomic_set(bio-bi_remaining, 1);
atomic_set(bio-bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -295,9 +296,35 @@ void bio_reset(struct bio *bio)
 
memset(bio, 0, BIO_RESET_BYTES);
bio-bi_flags = flags|(1  BIO_UPTODATE);
+   atomic_set(bio-bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_chain_endio(struct bio *bio, int error)
+{
+   bio_endio(bio

[PATCH 23/25] block: Rename bio_split() - bio_pair_split()

2013-11-26 Thread Kent Overstreet
This is prep work for introducing a more general bio_split().

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: NeilBrown ne...@suse.de
Cc: Alasdair Kergon a...@redhat.com
Cc: Lars Ellenberg lars.ellenb...@linbit.com
Cc: Peter Osterlund pete...@telia.com
Cc: Sage Weil s...@inktank.com
---
 drivers/block/pktcdvd.c | 2 +-
 drivers/md/linear.c | 2 +-
 drivers/md/raid0.c  | 6 +++---
 drivers/md/raid10.c | 2 +-
 fs/bio.c| 4 ++--
 include/linux/bio.h | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index ce986ba..28789b8 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2399,7 +2399,7 @@ static void pkt_make_request(struct request_queue *q, 
struct bio *bio)
if (last_zone != zone) {
BUG_ON(last_zone != zone + pd-settings.size);
first_sectors = last_zone - bio-bi_iter.bi_sector;
-   bp = bio_split(bio, first_sectors);
+   bp = bio_pair_split(bio, first_sectors);
BUG_ON(!bp);
pkt_make_request(q, bp-bio1);
pkt_make_request(q, bp-bio2);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fb3b0d0..e9b53e9 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -326,7 +326,7 @@ static void linear_make_request(struct mddev *mddev, struct 
bio *bio)
 
rcu_read_unlock();
 
-   bp = bio_split(bio, end_sector - bio-bi_iter.bi_sector);
+   bp = bio_pair_split(bio, end_sector - bio-bi_iter.bi_sector);
 
linear_make_request(mddev, bp-bio1);
linear_make_request(mddev, bp-bio2);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 8ee1a6c..ea754dd 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -534,11 +534,11 @@ static void raid0_make_request(struct mddev *mddev, 
struct bio *bio)
 * refuse to split for us, so we need to split it.
 */
if (likely(is_power_of_2(chunk_sects)))
-   bp = bio_split(bio, chunk_sects - (sector 
+   bp = bio_pair_split(bio, chunk_sects - (sector 
   (chunk_sects-1)));
else
-   bp = bio_split(bio, chunk_sects -
-  sector_div(sector, chunk_sects));
+   bp = bio_pair_split(bio, chunk_sects -
+   sector_div(sector, chunk_sects));
raid0_make_request(mddev, bp-bio1);
raid0_make_request(mddev, bp-bio2);
bio_pair_release(bp);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ac4bfa438c..69c1bc8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1193,7 +1193,7 @@ static void make_request(struct mddev *mddev, struct bio 
* bio)
/* This is a one page bio that upper layers
 * refuse to split for us, so we need to split it.
 */
-   bp = bio_split(bio, chunk_sects -
+   bp = bio_pair_split(bio, chunk_sects -
   (bio-bi_iter.bi_sector  (chunk_sects - 1)));
 
/* Each of these 'make_request' calls will call 'wait_barrier'.
diff --git a/fs/bio.c b/fs/bio.c
index b0a16db..a3e753f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1827,7 +1827,7 @@ static void bio_pair_end_2(struct bio *bi, int err)
 /*
  * split a bio - only worry about a bio with a single page in its iovec
  */
-struct bio_pair *bio_split(struct bio *bi, int first_sectors)
+struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors)
 {
struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
 
@@ -1874,7 +1874,7 @@ struct bio_pair *bio_split(struct bio *bi, int 
first_sectors)
 
return bp;
 }
-EXPORT_SYMBOL(bio_split);
+EXPORT_SYMBOL(bio_pair_split);
 
 /**
  * bio_trim - trim a bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 64f5169..aa67af0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -317,7 +317,7 @@ struct bio_pair {
atomic_tcnt;
int error;
 };
-extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
+extern struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors);
 extern void bio_pair_release(struct bio_pair *dbio);
 extern void bio_trim(struct bio *bio, int offset, int size);
 
-- 
1.8.4.4



[PATCH 17/25] block: Add bio_clone_fast()

2013-11-26 Thread Kent Overstreet
bio_clone() just got more expensive - however, most users of bio_clone()
don't actually need to modify the biovec. If they aren't modifying the
biovec, and they can guarantee that the original bio isn't freed before
the clone (also true in most cases), we can just point the clone at the
original bio's biovec.

Signed-off-by: Kent Overstreet k...@daterainc.com
---
 drivers/md/bcache/request.c |  8 ++
 fs/bio.c| 60 +
 include/linux/bio.h |  2 ++
 3 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4c0a422..63451c7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -613,7 +613,6 @@ struct search {
 
struct btree_op op;
struct data_insert_op   iop;
-   struct bio_vec  bv[BIO_MAX_PAGES];
 };
 
 static void bch_cache_read_endio(struct bio *bio, int error)
@@ -761,9 +760,7 @@ static void do_bio_hook(struct search *s)
struct bio *bio = s-bio.bio;
 
bio_init(bio);
-   bio-bi_io_vec  = s-bv;
-   bio-bi_max_vecs= BIO_MAX_PAGES;
-   __bio_clone(bio, s-orig_bio);
+   __bio_clone_fast(bio, s-orig_bio);
bio-bi_end_io  = request_endio;
bio-bi_private = s-cl;
 
@@ -1065,8 +1062,7 @@ static void cached_dev_write(struct cached_dev *dc, 
struct search *s)
closure_bio_submit(flush, cl, s-d);
}
} else {
-   s-iop.bio = bio_clone_bioset(bio, GFP_NOIO,
- dc-disk.bio_split);
+   s-iop.bio = bio_clone_fast(bio, GFP_NOIO, dc-disk.bio_split);
 
closure_bio_submit(bio, cl, s-d);
}
diff --git a/fs/bio.c b/fs/bio.c
index 1628917..00dc189 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -549,6 +549,66 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 EXPORT_SYMBOL(__bio_clone);
 
 /**
+ * __bio_clone_fast - clone a bio that shares the original bio's biovec
+ * @bio: destination bio
+ * @bio_src: bio to clone
+ *
+ * Clone a bio. Caller will own the returned bio, but not
+ * the actual data it points to. Reference count of returned
+ * bio will be one.
+ *
+ * Caller must ensure that @bio_src is not freed before @bio.
+ */
+void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+{
+   BUG_ON(bio-bi_pool  BIO_POOL_IDX(bio) != BIO_POOL_NONE);
+
+   /*
+* most users will be overriding -bi_bdev with a new target,
+* so we don't set nor calculate new physical/hw segment counts here
+*/
+   bio-bi_bdev = bio_src-bi_bdev;
+   bio-bi_flags |= 1  BIO_CLONED;
+   bio-bi_rw = bio_src-bi_rw;
+   bio-bi_iter = bio_src-bi_iter;
+   bio-bi_io_vec = bio_src-bi_io_vec;
+}
+EXPORT_SYMBOL(__bio_clone_fast);
+
+/**
+ * bio_clone_fast - clone a bio that shares the original bio's biovec
+ * @bio: bio to clone
+ * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Like __bio_clone_fast, only also allocates the returned bio
+ */
+struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+{
+   struct bio *b;
+
+   b = bio_alloc_bioset(gfp_mask, 0, bs);
+   if (!b)
+   return NULL;
+
+   __bio_clone_fast(b, bio);
+
+   if (bio_integrity(bio)) {
+   int ret;
+
+   ret = bio_integrity_clone(b, bio, gfp_mask);
+
+   if (ret  0) {
+   bio_put(b);
+   return NULL;
+   }
+   }
+
+   return b;
+}
+EXPORT_SYMBOL(bio_clone_fast);
+
+/**
  * bio_clone_bioset - clone a bio
  * @bio_src: bio to clone
  * @gfp_mask: allocation priority
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1a31f9d..1f83f4a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -328,6 +328,8 @@ extern mempool_t *biovec_create_pool(struct bio_set *bs, 
int pool_entries);
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
 
+extern void __bio_clone_fast(struct bio *, struct bio *);
+extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *);
 extern void __bio_clone(struct bio *, struct bio *);
 extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs);
 
-- 
1.8.4.4



[PATCH 25/25] block: Kill bio_pair_split()

2013-11-26 Thread Kent Overstreet
Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 fs/bio-integrity.c  | 45 ---
 fs/bio.c| 90 -
 include/linux/bio.h | 30 --
 3 files changed, 165 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9d547d2..80d972d 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -581,51 +581,6 @@ void bio_integrity_trim(struct bio *bio, unsigned int 
offset,
 EXPORT_SYMBOL(bio_integrity_trim);
 
 /**
- * bio_integrity_split - Split integrity metadata
- * @bio:   Protected bio
- * @bp:Resulting bio_pair
- * @sectors:   Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
-   struct blk_integrity *bi;
-   struct bio_integrity_payload *bip = bio-bi_integrity;
-   unsigned int nr_sectors;
-
-   if (bio_integrity(bio) == 0)
-   return;
-
-   bi = bdev_get_integrity(bio-bi_bdev);
-   BUG_ON(bi == NULL);
-   BUG_ON(bip-bip_vcnt != 1);
-
-   nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
-   bp-bio1.bi_integrity = bp-bip1;
-   bp-bio2.bi_integrity = bp-bip2;
-
-   bp-iv1 = bip-bip_vec[bip-bip_iter.bi_idx];
-   bp-iv2 = bip-bip_vec[bip-bip_iter.bi_idx];
-
-   bp-bip1.bip_vec = bp-iv1;
-   bp-bip2.bip_vec = bp-iv2;
-
-   bp-iv1.bv_len = sectors * bi-tuple_size;
-   bp-iv2.bv_offset += sectors * bi-tuple_size;
-   bp-iv2.bv_len -= sectors * bi-tuple_size;
-
-   bp-bip1.bip_iter.bi_sector = bio-bi_integrity-bip_iter.bi_sector;
-   bp-bip2.bip_iter.bi_sector =
-   bio-bi_integrity-bip_iter.bi_sector + nr_sectors;
-
-   bp-bip1.bip_vcnt = bp-bip2.bip_vcnt = 1;
-   bp-bip1.bip_iter.bi_idx = bp-bip2.bip_iter.bi_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
  * bio_integrity_clone - Callback for cloning bios with integrity metadata
  * @bio:   New bio
  * @bio_src:   Original bio
diff --git a/fs/bio.c b/fs/bio.c
index 7b062be..75c49a3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -38,8 +38,6 @@
  */
 #define BIO_INLINE_VECS4
 
-static mempool_t *bio_split_pool __read_mostly;
-
 /*
  * if you change this list, also change bvec_alloc or things will
  * break badly! cannot be bigger than what you can fit into an
@@ -1829,89 +1827,6 @@ struct bio *bio_split(struct bio *bio, int sectors,
 }
 EXPORT_SYMBOL(bio_split);
 
-void bio_pair_release(struct bio_pair *bp)
-{
-   if (atomic_dec_and_test(bp-cnt)) {
-   struct bio *master = bp-bio1.bi_private;
-
-   bio_endio(master, bp-error);
-   mempool_free(bp, bp-bio2.bi_private);
-   }
-}
-EXPORT_SYMBOL(bio_pair_release);
-
-static void bio_pair_end_1(struct bio *bi, int err)
-{
-   struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
-
-   if (err)
-   bp-error = err;
-
-   bio_pair_release(bp);
-}
-
-static void bio_pair_end_2(struct bio *bi, int err)
-{
-   struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
-
-   if (err)
-   bp-error = err;
-
-   bio_pair_release(bp);
-}
-
-/*
- * split a bio - only worry about a bio with a single page in its iovec
- */
-struct bio_pair *bio_pair_split(struct bio *bi, int first_sectors)
-{
-   struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
-
-   if (!bp)
-   return bp;
-
-   trace_block_split(bdev_get_queue(bi-bi_bdev), bi,
-   bi-bi_iter.bi_sector + first_sectors);
-
-   BUG_ON(bio_multiple_segments(bi));
-   atomic_set(bp-cnt, 3);
-   bp-error = 0;
-   bp-bio1 = *bi;
-   bp-bio2 = *bi;
-   bp-bio2.bi_iter.bi_sector += first_sectors;
-   bp-bio2.bi_iter.bi_size -= first_sectors  9;
-   bp-bio1.bi_iter.bi_size = first_sectors  9;
-
-   if (bi-bi_vcnt != 0) {
-   bp-bv1 = bio_iovec(bi);
-   bp-bv2 = bio_iovec(bi);
-
-   if (bio_is_rw(bi)) {
-   bp-bv2.bv_offset += first_sectors  9;
-   bp-bv2.bv_len -= first_sectors  9;
-   bp-bv1.bv_len = first_sectors  9;
-   }
-
-   bp-bio1.bi_io_vec = bp-bv1;
-   bp-bio2.bi_io_vec = bp-bv2;
-
-   bp-bio1.bi_max_vecs = 1;
-   bp-bio2.bi_max_vecs = 1;
-   }
-
-   bp-bio1.bi_end_io = bio_pair_end_1;
-   bp-bio2.bi_end_io = bio_pair_end_2;
-
-   bp-bio1.bi_private = bi;
-   bp-bio2.bi_private = bio_split_pool;
-
-   if (bio_integrity(bi))
-   bio_integrity_split(bi, bp, first_sectors);
-
-   return bp;
-}
-EXPORT_SYMBOL(bio_pair_split);
-
 /**
  * bio_trim - trim a bio
  * @bio:   bio to trim
@@ -2113,11 +2028,6 @@ static int __init init_bio(void)
if (bioset_integrity_create(fs_bio_set

[PATCH 21/25] block: Remove bi_idx hacks

2013-11-26 Thread Kent Overstreet
Now that drivers have been converted to the new bvec_iter primitives,
there's no need to trim the bvec before we submit it; and we can't trim
it once we start sharing bvecs.

It used to be that passing a partially completed bio (i.e. one with
nonzero bi_idx) to generic_make_request() was a dangerous thing -
various drivers would choke on such things. But with immutable biovecs
and our new bio splitting that shares the biovecs, submitting partially
completed bios has to work (and should work, now that all the drivers
have been converted to the new primitives).

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 drivers/md/bcache/io.c | 47 ++-
 fs/bio.c   | 23 ---
 2 files changed, 2 insertions(+), 68 deletions(-)

diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index 6e04f3b..0f0ab65 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -11,49 +11,6 @@
 
 #include linux/blkdev.h
 
-static void bch_bi_idx_hack_endio(struct bio *bio, int error)
-{
-   struct bio *p = bio-bi_private;
-
-   bio_endio(p, error);
-   bio_put(bio);
-}
-
-static void bch_generic_make_request_hack(struct bio *bio)
-{
-   if (bio-bi_iter.bi_idx) {
-   struct bio_vec bv;
-   struct bvec_iter iter;
-   unsigned segs = bio_segments(bio);
-   struct bio *clone = bio_alloc(GFP_NOIO, segs);
-
-   bio_for_each_segment(bv, bio, iter)
-   clone-bi_io_vec[clone-bi_vcnt++] = bv;
-
-   clone-bi_iter.bi_sector = bio-bi_iter.bi_sector;
-   clone-bi_bdev  = bio-bi_bdev;
-   clone-bi_rw= bio-bi_rw;
-   clone-bi_vcnt  = segs;
-   clone-bi_iter.bi_size  = bio-bi_iter.bi_size;
-
-   clone-bi_private   = bio;
-   clone-bi_end_io= bch_bi_idx_hack_endio;
-
-   bio = clone;
-   }
-
-   /*
-* Hack, since drivers that clone bios clone up to bi_max_vecs, but our
-* bios might have had more than that (before we split them per device
-* limitations).
-*
-* To be taken out once immutable bvec stuff is in.
-*/
-   bio-bi_max_vecs = bio-bi_vcnt;
-
-   generic_make_request(bio);
-}
-
 /**
  * bch_bio_split - split a bio
  * @bio:   bio to split
@@ -222,12 +179,12 @@ void bch_generic_make_request(struct bio *bio, struct 
bio_split_pool *p)
n-bi_private   = s-cl;
 
closure_get(s-cl);
-   bch_generic_make_request_hack(n);
+   generic_make_request(n);
} while (n != bio);
 
continue_at(s-cl, bch_bio_submit_split_done, NULL);
 submit:
-   bch_generic_make_request_hack(bio);
+   generic_make_request(bio);
 }
 
 /* Bios with headers */
diff --git a/fs/bio.c b/fs/bio.c
index 9cff939..e6dfa06 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1822,11 +1822,7 @@ void bio_trim(struct bio *bio, int offset, int size)
 {
/* 'bio' is a cloned bio which we need to trim to match
 * the given offset and size.
-* This requires adjusting bi_sector, bi_size, and bi_io_vec
 */
-   int i;
-   struct bio_vec *bvec;
-   int sofar = 0;
 
size = 9;
if (offset == 0  size == bio-bi_iter.bi_size)
@@ -1837,25 +1833,6 @@ void bio_trim(struct bio *bio, int offset, int size)
bio_advance(bio, offset  9);
 
bio-bi_iter.bi_size = size;
-
-   /* avoid any complications with bi_idx being non-zero*/
-   if (bio-bi_iter.bi_idx) {
-   memmove(bio-bi_io_vec, bio-bi_io_vec+bio-bi_iter.bi_idx,
-   (bio-bi_vcnt - bio-bi_iter.bi_idx) *
-   sizeof(struct bio_vec));
-   bio-bi_vcnt -= bio-bi_iter.bi_idx;
-   bio-bi_iter.bi_idx = 0;
-   }
-   /* Make sure vcnt and last bv are not too big */
-   bio_for_each_segment_all(bvec, bio, i) {
-   if (sofar + bvec-bv_len  size)
-   bvec-bv_len = size - sofar;
-   if (bvec-bv_len == 0) {
-   bio-bi_vcnt = i;
-   break;
-   }
-   sofar += bvec-bv_len;
-   }
 }
 EXPORT_SYMBOL_GPL(bio_trim);
 
-- 
1.8.4.4



[PATCH 14/25] ceph: Convert to immutable biovecs

2013-11-26 Thread Kent Overstreet
Now that we've got a mechanism for immutable biovecs -
bi_iter.bi_bvec_done - we need to convert drivers to use primitives that
respect it instead of using the bvec array directly.
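
The target idiom: bio_for_each_segment() now yields struct bio_vec by
value, computed from bi_iter, so a consumer never reads bi_io_vec or
bi_vcnt directly. A sketch (walk_bio() is a made-up example):

static void walk_bio(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_segment(bv, bio, iter)
		pr_debug("page %p offset %u len %u\n",
			 bv.bv_page, bv.bv_offset, bv.bv_len);
}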

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
Cc: Sage Weil s...@inktank.com
Cc: ceph-de...@vger.kernel.org
---
 include/linux/ceph/messenger.h |  4 ++--
 net/ceph/messenger.c   | 43 +-
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 7c1420b..091fdb6 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -1,6 +1,7 @@
 #ifndef __FS_CEPH_MESSENGER_H
 #define __FS_CEPH_MESSENGER_H
 
+#include linux/blk_types.h
 #include linux/kref.h
 #include linux/mutex.h
 #include linux/net.h
@@ -119,8 +120,7 @@ struct ceph_msg_data_cursor {
 #ifdef CONFIG_BLOCK
struct {/* bio */
struct bio  *bio;   /* bio from list */
-   unsigned intvector_index;   /* vector from bio */
-   unsigned intvector_offset;  /* bytes from vector */
+   struct bvec_iter bvec_iter;
};
 #endif /* CONFIG_BLOCK */
struct {/* pages */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4a5df7b..18c039b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -777,13 +777,12 @@ static void ceph_msg_data_bio_cursor_init(struct 
ceph_msg_data_cursor *cursor,
 
bio = data-bio;
BUG_ON(!bio);
-   BUG_ON(!bio-bi_vcnt);
 
cursor-resid = min(length, data-bio_length);
cursor-bio = bio;
-   cursor-vector_index = 0;
-   cursor-vector_offset = 0;
-   cursor-last_piece = length = bio-bi_io_vec[0].bv_len;
+   cursor-bvec_iter = bio-bi_iter;
+   cursor-last_piece =
+   cursor-resid = bio_iter_len(bio, cursor-bvec_iter);
 }
 
 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -792,71 +791,63 @@ static struct page *ceph_msg_data_bio_next(struct 
ceph_msg_data_cursor *cursor,
 {
struct ceph_msg_data *data = cursor-data;
struct bio *bio;
-   struct bio_vec *bio_vec;
-   unsigned int index;
+   struct bio_vec bio_vec;
 
BUG_ON(data-type != CEPH_MSG_DATA_BIO);
 
bio = cursor-bio;
BUG_ON(!bio);
 
-   index = cursor-vector_index;
-   BUG_ON(index = (unsigned int) bio-bi_vcnt);
+   bio_vec = bio_iter_iovec(bio, cursor-bvec_iter);
 
-   bio_vec = bio-bi_io_vec[index];
-   BUG_ON(cursor-vector_offset = bio_vec-bv_len);
-   *page_offset = (size_t) (bio_vec-bv_offset + cursor-vector_offset);
+   *page_offset = (size_t) bio_vec.bv_offset;
BUG_ON(*page_offset = PAGE_SIZE);
if (cursor-last_piece) /* pagelist offset is always 0 */
*length = cursor-resid;
else
-   *length = (size_t) (bio_vec-bv_len - cursor-vector_offset);
+   *length = (size_t) bio_vec.bv_len;
BUG_ON(*length  cursor-resid);
BUG_ON(*page_offset + *length  PAGE_SIZE);
 
-   return bio_vec-bv_page;
+   return bio_vec.bv_page;
 }
 
 static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
 {
struct bio *bio;
-   struct bio_vec *bio_vec;
-   unsigned int index;
+   struct bio_vec bio_vec;
 
BUG_ON(cursor-data-type != CEPH_MSG_DATA_BIO);
 
bio = cursor-bio;
BUG_ON(!bio);
 
-   index = cursor-vector_index;
-   BUG_ON(index = (unsigned int) bio-bi_vcnt);
-   bio_vec = bio-bi_io_vec[index];
+   bio_vec = bio_iter_iovec(bio, cursor-bvec_iter);
 
/* Advance the cursor offset */
 
BUG_ON(cursor-resid  bytes);
cursor-resid -= bytes;
-   cursor-vector_offset += bytes;
-   if (cursor-vector_offset  bio_vec-bv_len)
+
+   bio_advance_iter(bio, cursor-bvec_iter, bytes);
+
+   if (bytes  bio_vec.bv_len)
return false;   /* more bytes to process in this segment */
-   BUG_ON(cursor-vector_offset != bio_vec-bv_len);
 
/* Move on to the next segment, and possibly the next bio */
 
-   if (++index == (unsigned int) bio-bi_vcnt) {
+   if (!cursor-bvec_iter.bi_size) {
bio = bio-bi_next;
-   index = 0;
+   cursor-bvec_iter = bio-bi_iter;
}
cursor-bio = bio;
-   cursor-vector_index = index;
-   cursor-vector_offset = 0;
 
if (!cursor-last_piece) {
BUG_ON(!cursor-resid);
BUG_ON(!bio);
/* A short read is OK, so use = rather than == */
-   if (cursor-resid = bio-bi_io_vec[index].bv_len)
+   if (cursor-resid = bio_iter_len(bio, cursor

[PATCH 15/25] block: Kill bio_iovec_idx(), __bio_iovec()

2013-11-26 Thread Kent Overstreet
bio_iovec_idx() and __bio_iovec() don't have any valid uses anymore -
previous users have been converted to bio_iovec_iter() or other methods.

__BVEC_END() has to go too - the bvec array can't be used directly for
the last biovec, because we might only be using the first portion of it;
we have to iterate over the bvec array with bio_for_each_segment(),
which checks against the current value of bi_iter.bi_size.

Signed-off-by: Kent Overstreet k...@daterainc.com
Cc: Jens Axboe ax...@kernel.dk
---
 block/blk-merge.c   | 13 +++--
 include/linux/bio.h | 26 --
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a1ead90..05c17be 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -86,6 +86,9 @@ EXPORT_SYMBOL(blk_recount_segments);
 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
   struct bio *nxt)
 {
+   struct bio_vec end_bv, nxt_bv;
+   struct bvec_iter iter;
+
if (!blk_queue_cluster(q))
return 0;
 
@@ -96,14 +99,20 @@ static int blk_phys_contig_segment(struct request_queue *q, 
struct bio *bio,
if (!bio_has_data(bio))
return 1;
 
-   if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+   bio_for_each_segment(end_bv, bio, iter)
+   if (end_bv.bv_len == iter.bi_size)
+   break;
+
+   nxt_bv = bio_iovec(nxt);
+
+   if (!BIOVEC_PHYS_MERGEABLE(end_bv, nxt_bv))
return 0;
 
/*
 * bio and nxt are contiguous in memory; check if the queue allows
 * these two to be merged into one
 */
-   if (BIO_SEG_BOUNDARY(q, bio, nxt))
+   if (BIOVEC_SEG_BOUNDARY(q, end_bv, nxt_bv))
return 1;
 
return 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index aea9896..1a31f9d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -61,9 +61,6 @@
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
  */
-#define bio_iovec_idx(bio, idx)(((bio)-bi_io_vec[(idx)]))
-#define __bio_iovec(bio)   bio_iovec_idx((bio), (bio)-bi_iter.bi_idx)
-
 #define __bvec_iter_bvec(bvec, iter)   ((bvec)[(iter).bi_idx])
 
 #define bvec_iter_page(bvec, iter) \
@@ -162,19 +159,16 @@ static inline void *bio_data(struct bio *bio)
  * permanent PIO fall back, user is probably better off disabling highmem
  * I/O completely on that queue (see ide-dma for example)
  */
-#define __bio_kmap_atomic(bio, idx)\
-   (kmap_atomic(bio_iovec_idx((bio), (idx))-bv_page) +\
-   bio_iovec_idx((bio), (idx))-bv_offset)
+#define __bio_kmap_atomic(bio, iter)   \
+   (kmap_atomic(bio_iter_iovec((bio), (iter)).bv_page) +   \
+   bio_iter_iovec((bio), (iter)).bv_offset)
 
-#define __bio_kunmap_atomic(addr) kunmap_atomic(addr)
+#define __bio_kunmap_atomic(addr)  kunmap_atomic(addr)
 
 /*
  * merge helpers etc
  */
 
-#define __BVEC_END(bio)bio_iovec_idx((bio), (bio)-bi_vcnt - 1)
-#define __BVEC_START(bio)  bio_iovec_idx((bio), (bio)-bi_iter.bi_idx)
-
 /* Default implementation of BIOVEC_PHYS_MERGEABLE */
 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2)\
((bvec_to_phys((vec1)) + (vec1)-bv_len) == bvec_to_phys((vec2)))
@@ -191,8 +185,6 @@ static inline void *bio_data(struct bio *bio)
(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + 
(b2)-bv_len, queue_segment_boundary((q)))
-#define BIO_SEG_BOUNDARY(q, b1, b2) \
-   BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
 
 #define bio_io_error(bio) bio_endio((bio), -EIO)
 
@@ -201,9 +193,7 @@ static inline void *bio_data(struct bio *bio)
  * before it got to the driver and the driver won't own all of it
  */
 #define bio_for_each_segment_all(bvl, bio, i)  \
-   for (i = 0; \
-bvl = bio_iovec_idx((bio), (i)), i  (bio)-bi_vcnt;   \
-i++)
+   for (i = 0, bvl = (bio)-bi_io_vec; i  (bio)-bi_vcnt; i++, bvl++)
 
 static inline void bvec_iter_advance(struct bio_vec *bv, struct bvec_iter 
*iter,
 unsigned bytes)
@@ -468,15 +458,15 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned 
long *flags)
 }
 #endif
 
-static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
+static inline char *__bio_kmap_irq(struct bio *bio, struct bvec_iter iter,
   unsigned long *flags)
 {
-   return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
+   return bvec_kmap_irq(bio_iter_iovec(bio, iter), flags);
 }
 #define __bio_kunmap_irq(buf, flags)   bvec_kunmap_irq(buf
