[PATCH 01/11] blk: remove bio_set arg from blk_queue_split()

2017-04-19 Thread NeilBrown
blk_queue_split() is always called with the last arg being q->bio_split,
where 'q' is the first arg.

Also blk_queue_split() sometimes uses the passed-in 'bs' and sometimes uses
q->bio_split.

This is inconsistent and unnecessary.  Remove the last arg and always use
q->bio_split inside blk_queue_split()

Signed-off-by: NeilBrown 
---
 block/blk-core.c  |2 +-
 block/blk-merge.c |9 -
 block/blk-mq.c|2 +-
 drivers/block/drbd/drbd_req.c |2 +-
 drivers/block/pktcdvd.c   |2 +-
 drivers/block/ps3vram.c   |2 +-
 drivers/block/rsxx/dev.c  |2 +-
 drivers/block/umem.c  |2 +-
 drivers/block/zram/zram_drv.c |2 +-
 drivers/lightnvm/rrpc.c   |2 +-
 drivers/md/md.c   |2 +-
 drivers/s390/block/dcssblk.c  |2 +-
 drivers/s390/block/xpram.c|2 +-
 include/linux/blkdev.h|3 +--
 14 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 25aea293ee98..f5d64ad75b36 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1662,7 +1662,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
 */
blk_queue_bounce(q, );
 
-   blk_queue_split(q, , q->bio_split);
+   blk_queue_split(q, );
 
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio->bi_error = -EIO;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..d59074556703 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -202,8 +202,7 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
return do_split ? new : NULL;
 }
 
-void blk_queue_split(struct request_queue *q, struct bio **bio,
-struct bio_set *bs)
+void blk_queue_split(struct request_queue *q, struct bio **bio)
 {
struct bio *split, *res;
unsigned nsegs;
@@ -211,13 +210,13 @@ void blk_queue_split(struct request_queue *q, struct bio 
**bio,
switch (bio_op(*bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
-   split = blk_bio_discard_split(q, *bio, bs, );
+   split = blk_bio_discard_split(q, *bio, q->bio_split, );
break;
case REQ_OP_WRITE_ZEROES:
-   split = blk_bio_write_zeroes_split(q, *bio, bs, );
+   split = blk_bio_write_zeroes_split(q, *bio, q->bio_split, 
);
break;
case REQ_OP_WRITE_SAME:
-   split = blk_bio_write_same_split(q, *bio, bs, );
+   split = blk_bio_write_same_split(q, *bio, q->bio_split, );
break;
default:
split = blk_bio_segment_split(q, *bio, q->bio_split, );
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c496692ecc5b..365cb17308e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1549,7 +1549,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
return BLK_QC_T_NONE;
}
 
-   blk_queue_split(q, , q->bio_split);
+   blk_queue_split(q, );
 
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, _count, _queue_rq))
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index b5730e17b455..fa62dd8a4d46 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1557,7 +1557,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, 
struct bio *bio)
struct drbd_device *device = (struct drbd_device *) q->queuedata;
unsigned long start_jif;
 
-   blk_queue_split(q, , q->bio_split);
+   blk_queue_split(q, );
 
start_jif = jiffies;
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a9..98394d034c29 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2414,7 +2414,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, 
struct bio *bio)
 
blk_queue_bounce(q, );
 
-   blk_queue_split(q, , q->bio_split);
+   blk_queue_split(q, );
 
pd = q->queuedata;
if (!pd) {
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 456b4fe21559..48072c0c1010 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -606,7 +606,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue 
*q, struct bio *bio)
 
dev_dbg(>core, "%s\n", __func__);
 
-   blk_queue_split(q, , q->bio_split);
+   blk_queue_split(q, );
 
spin_lock_irq(>lock);
busy = !bio_list_empty(>list);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 9c566364ac9c..01624eaefcba 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -151,7 +151,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, 
struct bio *bio)
struct rsxx_bio_meta *bio_meta;
int st = -EINVAL;
 
-   blk_queue_split(q, , q->bio_split);
+   

[PATCH 03/11] blk: use non-rescuing bioset for q->bio_split.

2017-04-19 Thread NeilBrown
A rescuing bioset is only useful if there might be bios from
that same bioset on the bio_list_on_stack queue at a time
when bio_alloc_bioset() is called.  This never applies to
q->bio_split.

Allocations from q->bio_split are only ever made from
blk_queue_split() which is only ever called early in each of
various make_request_fn()s.  The original bio (call this A)
is then passed to generic_make_request() and is placed on
the bio_list_on_stack queue, and the bio that was allocated
from q->bio_split (B) is processed.

The processing of this may cause other bios to be passed to
generic_make_request() or may even cause the bio B itself to
be passed, possibly after some prefix has been split off
(using some other bioset).

generic_make_request() now guarantees that all of these bios
(B and dependants) will be fully processed before the tail
of the original bio A gets handled.  None of these early bios
can possibly trigger an allocation from the original
q->bio_split as they are either too small to require
splitting or (more likely) are destined for a different queue.

The next time that the original q->bio_split might be used
by this thread is when A is processed again, as it might
still be too big to handle directly.  By this time there
cannot be any other bios allocated from q->bio_split in the
generic_make_request() queue.  So no rescuing will ever be
needed.

Signed-off-by: NeilBrown 
---
 block/blk-core.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 23f20cb84b2f..f5d64ad75b36 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -728,7 +728,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (q->id < 0)
goto fail_q;
 
-   q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
+   q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
if (!q->bio_split)
goto fail_id;
 




[PATCH 02/11] blk: make the bioset rescue_workqueue optional.

2017-04-19 Thread NeilBrown
This patch converts bioset_create() and
bioset_create_nobvec() to not create a workqueue so
allocations will never trigger punt_bios_to_rescuer().  It
also introduces bioset_create_rescued() and
bioset_create_nobvec_rescued() which preserve the old
behaviour.

All callers of bioset_create() and bioset_create_nobvec(),
that are inside block device drivers, are converted to the
_rescued() version.

biosets used by filesystems or other top-level users do not
need rescuing as the bio can never be queued behind other
bios.  This includes fs_bio_set, blkdev_dio_pool,
btrfs_bioset, xfs_ioend_bioset, drbd_md_io_bio_set,
and one allocated by target_core_iblock.c.

biosets used by md/raid do not need rescuing as
their usage was recently audited and revised to never
risk deadlock.

It is hoped that most, if not all, of the remaining biosets
can end up being the non-rescued version.

Signed-off-by: NeilBrown 
---
 block/bio.c   |   28 
 block/blk-core.c  |2 +-
 drivers/md/bcache/super.c |4 ++--
 drivers/md/dm-crypt.c |2 +-
 drivers/md/dm-io.c|2 +-
 drivers/md/dm.c   |5 +++--
 include/linux/bio.h   |2 ++
 7 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 888e7801c638..b8e304015dc8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -363,6 +363,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
struct bio_list punt, nopunt;
struct bio *bio;
 
+   if (!WARN_ON_ONCE(!bs->rescue_workqueue))
+   return;
/*
 * In order to guarantee forward progress we must punt only bios that
 * were allocated from this bio_set; otherwise, if there was a bio on
@@ -474,7 +476,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int 
nr_iovecs,
 
if (current->bio_list &&
(!bio_list_empty(>bio_list[0]) ||
-!bio_list_empty(>bio_list[1])))
+!bio_list_empty(>bio_list[1])) &&
+   bs->rescue_workqueue)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
p = mempool_alloc(bs->bio_pool, gfp_mask);
@@ -1923,7 +1926,8 @@ EXPORT_SYMBOL(bioset_free);
 
 static struct bio_set *__bioset_create(unsigned int pool_size,
   unsigned int front_pad,
-  bool create_bvec_pool)
+  bool create_bvec_pool,
+  bool create_rescue_workqueue)
 {
unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
struct bio_set *bs;
@@ -1954,6 +1958,9 @@ static struct bio_set *__bioset_create(unsigned int 
pool_size,
goto bad;
}
 
+   if (!create_rescue_workqueue)
+   return bs;
+
bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
if (!bs->rescue_workqueue)
goto bad;
@@ -1979,10 +1986,16 @@ static struct bio_set *__bioset_create(unsigned int 
pool_size,
  */
 struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 {
-   return __bioset_create(pool_size, front_pad, true);
+   return __bioset_create(pool_size, front_pad, true, false);
 }
 EXPORT_SYMBOL(bioset_create);
 
+struct bio_set *bioset_create_rescued(unsigned int pool_size, unsigned int 
front_pad)
+{
+   return __bioset_create(pool_size, front_pad, true, true);
+}
+EXPORT_SYMBOL(bioset_create_rescued);
+
 /**
  * bioset_create_nobvec  - Create a bio_set without bio_vec mempool
  * @pool_size: Number of bio to cache in the mempool
@@ -1994,10 +2007,17 @@ EXPORT_SYMBOL(bioset_create);
  */
 struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int 
front_pad)
 {
-   return __bioset_create(pool_size, front_pad, false);
+   return __bioset_create(pool_size, front_pad, false, false);
 }
 EXPORT_SYMBOL(bioset_create_nobvec);
 
+struct bio_set *bioset_create_nobvec_rescued(unsigned int pool_size,
+unsigned int front_pad)
+{
+   return __bioset_create(pool_size, front_pad, false, true);
+}
+EXPORT_SYMBOL(bioset_create_nobvec_rescued);
+
 #ifdef CONFIG_BLK_CGROUP
 
 /**
diff --git a/block/blk-core.c b/block/blk-core.c
index f5d64ad75b36..23f20cb84b2f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -728,7 +728,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (q->id < 0)
goto fail_q;
 
-   q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+   q->bio_split = bioset_create_rescued(BIO_POOL_SIZE, 0);
if (!q->bio_split)
goto fail_id;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 85e3f21c2514..6cb30792f0ed 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -786,7 +786,7 @@ static int 

[PATCH 04/11] block: Improvements to bounce-buffer handling

2017-04-19 Thread NeilBrown
Since commit 23688bf4f830 ("block: ensure to split after potentially
bouncing a bio") blk_queue_bounce() is called *before*
blk_queue_split().
This means that:
 1/ the comments blk_queue_split() about bounce buffers are
irrelevant, and
 2/ a very large bio (more than BIO_MAX_PAGES) will no longer be
split before it arrives at blk_queue_bounce(), leading to the
possibility that bio_clone_bioset() will fail and a NULL
will be dereferenced.

Separately, blk_queue_bounce() shouldn't use fs_bio_set as the bio
being copied could be from the same set, and this could lead to a
deadlock.

So:
 - allocate 2 private biosets for blk_queue_bounce, one for
   splitting enormous bios and one for cloning bios.
 - add code to split a bio that exceeds BIO_MAX_PAGES.
 - Fix up the comments in blk_queue_split()

Signed-off-by: NeilBrown 
---
 block/blk-merge.c |   14 --
 block/bounce.c|   27 ++-
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d59074556703..51c84540d3bb 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -117,17 +117,11 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
 * each holds at most BIO_MAX_PAGES bvecs because
 * bio_clone() can fail to allocate big bvecs.
 *
-* It should have been better to apply the limit per
-* request queue in which bio_clone() is involved,
-* instead of globally. The biggest blocker is the
-* bio_clone() in bio bounce.
+* Those drivers which will need to use bio_clone()
+* should tell us in some way.  For now, impose the
+* BIO_MAX_PAGES limit on all queues.
 *
-* If bio is splitted by this reason, we should have
-* allowed to continue bios merging, but don't do
-* that now for making the change simple.
-*
-* TODO: deal with bio bounce's bio_clone() gracefully
-* and convert the global limit into per-queue limit.
+* TODO: handle users of bio_clone() differently.
 */
if (bvecs++ >= BIO_MAX_PAGES)
goto split;
diff --git a/block/bounce.c b/block/bounce.c
index 1cb5dd3a5da1..51fb538b504d 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -26,6 +26,7 @@
 #define POOL_SIZE  64
 #define ISA_POOL_SIZE  16
 
+struct bio_set *bounce_bio_set, *bounce_bio_split;
 static mempool_t *page_pool, *isa_page_pool;
 
 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
@@ -40,6 +41,14 @@ static __init int init_emergency_pool(void)
BUG_ON(!page_pool);
pr_info("pool size: %d pages\n", POOL_SIZE);
 
+   bounce_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+   BUG_ON(!bounce_bio_set);
+   if (bioset_integrity_create(bounce_bio_set, BIO_POOL_SIZE))
+   BUG_ON(1);
+
+   bounce_bio_split = bioset_create_nobvec(BIO_POOL_SIZE, 0);
+   BUG_ON(!bounce_bio_split);
+
return 0;
 }
 
@@ -194,7 +203,23 @@ static void __blk_queue_bounce(struct request_queue *q, 
struct bio **bio_orig,
 
return;
 bounce:
-   bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
+   if (bio_segments(*bio_orig) > BIO_MAX_PAGES) {
+   int cnt = 0;
+   int sectors = 0;
+   struct bio_vec bv;
+   struct bvec_iter iter;
+   bio_for_each_segment(bv, *bio_orig, iter) {
+   if (cnt++ < BIO_MAX_PAGES)
+   sectors += bv.bv_len >> 9;
+   else
+   break;
+   }
+   bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
+   bio_chain(bio, *bio_orig);
+   generic_make_request(*bio_orig);
+   *bio_orig = bio;
+   }
+   bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
 
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;




[PATCH 06/11] drbd: use bio_clone_fast() instead of bio_clone()

2017-04-19 Thread NeilBrown
drbd does not modify the bi_io_vec of the cloned bio,
so there is no need to clone that part.  So bio_clone_fast()
is the better choice.
For bio_clone_fast() we need to specify a bio_set.
We could use fs_bio_set, which bio_clone() uses, or
drbd_md_io_bio_set, which drbd uses for metadata, but it is
generally best to avoid sharing bio_sets unless you can
be certain that there are no interdependencies.

So create a new bio_set, drbd_io_bio_set, and use bio_clone_fast().

Signed-off-by: NeilBrown 
---
 drivers/block/drbd/drbd_int.h  |3 +++
 drivers/block/drbd/drbd_main.c |9 +
 drivers/block/drbd/drbd_req.h  |2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d5da45bb03a6..f91982515a6b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1441,6 +1441,9 @@ extern struct bio_set *drbd_md_io_bio_set;
 /* to allocate from that set */
 extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
 
+/* And a bio_set for cloning */
+extern struct bio_set *drbd_io_bio_set;
+
 extern struct mutex resources_mutex;
 
 extern int conn_lowest_minor(struct drbd_connection *connection);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 84455c365f57..0cc50a7ca1c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -128,6 +128,7 @@ mempool_t *drbd_request_mempool;
 mempool_t *drbd_ee_mempool;
 mempool_t *drbd_md_io_page_pool;
 struct bio_set *drbd_md_io_bio_set;
+struct bio_set *drbd_io_bio_set;
 
 /* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
@@ -2098,6 +2099,8 @@ static void drbd_destroy_mempools(void)
 
/* D_ASSERT(device, atomic_read(_pp_vacant)==0); */
 
+   if (drbd_io_bio_set)
+   bioset_free(drbd_io_bio_set);
if (drbd_md_io_bio_set)
bioset_free(drbd_md_io_bio_set);
if (drbd_md_io_page_pool)
@@ -2115,6 +2118,7 @@ static void drbd_destroy_mempools(void)
if (drbd_al_ext_cache)
kmem_cache_destroy(drbd_al_ext_cache);
 
+   drbd_io_bio_set  = NULL;
drbd_md_io_bio_set   = NULL;
drbd_md_io_page_pool = NULL;
drbd_ee_mempool  = NULL;
@@ -2142,6 +2146,7 @@ static int drbd_create_mempools(void)
drbd_pp_pool = NULL;
drbd_md_io_page_pool = NULL;
drbd_md_io_bio_set   = NULL;
+   drbd_io_bio_set  = NULL;
 
/* caches */
drbd_request_cache = kmem_cache_create(
@@ -2165,6 +2170,10 @@ static int drbd_create_mempools(void)
goto Enomem;
 
/* mempools */
+   drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+   if (drbd_io_bio_set == NULL)
+   goto Enomem;
+
drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
if (drbd_md_io_bio_set == NULL)
goto Enomem;
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index eb49e7f2da91..a24e870853eb 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -263,7 +263,7 @@ enum drbd_req_state_bits {
 static inline void drbd_req_make_private_bio(struct drbd_request *req, struct 
bio *bio_src)
 {
struct bio *bio;
-   bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+   bio = bio_clone_fast(bio_src, GFP_NOIO, drbd_io_bio_set); /* XXX cannot 
fail!! */
 
req->private_bio = bio;
 




[PATCH 07/11] pktcdvd: use bio_clone_fast() instead of bio_clone()

2017-04-19 Thread NeilBrown
pktcdvd doesn't change the bi_io_vec of the clone bio,
so it is more efficient to use bio_clone_fast(), and not clone
the bi_io_vec.
This requires providing a bio_set, and it is safest to
provide a dedicated bio_set rather than sharing
fs_bio_set, which filesystems use.
This new bio_set, pkt_bio_set, can also be used for the bio_split()
call as the two allocations (bio_clone_fast, and bio_split) are
independent, neither can block a bio allocated by the other.

Signed-off-by: NeilBrown 
---
 drivers/block/pktcdvd.c |   12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 98394d034c29..7a437b5b8804 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -98,6 +98,7 @@ static int write_congestion_on  = PKT_WRITE_CONGESTION_ON;
 static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
 static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */
 static mempool_t *psd_pool;
+static struct bio_set *pkt_bio_set;
 
 static struct class*class_pktcdvd = NULL;/* /sys/class/pktcdvd */
 static struct dentry   *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd 
*/
@@ -2310,7 +2311,7 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 
 static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
 {
-   struct bio *cloned_bio = bio_clone(bio, GFP_NOIO);
+   struct bio *cloned_bio = bio_clone_fast(bio, GFP_NOIO, pkt_bio_set);
struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
 
psd->pd = pd;
@@ -2455,7 +2456,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, 
struct bio *bio)
 
split = bio_split(bio, last_zone -
  bio->bi_iter.bi_sector,
- GFP_NOIO, fs_bio_set);
+ GFP_NOIO, pkt_bio_set);
bio_chain(split, bio);
} else {
split = bio;
@@ -2919,6 +2920,11 @@ static int __init pkt_init(void)
sizeof(struct packet_stacked_data));
if (!psd_pool)
return -ENOMEM;
+   pkt_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+   if (!pkt_bio_set) {
+   mempool_destroy(psd_pool);
+   return -ENOMEM;
+   }
 
ret = register_blkdev(pktdev_major, DRIVER_NAME);
if (ret < 0) {
@@ -2951,6 +2957,7 @@ static int __init pkt_init(void)
unregister_blkdev(pktdev_major, DRIVER_NAME);
 out2:
mempool_destroy(psd_pool);
+   bioset_free(pkt_bio_set);
return ret;
 }
 
@@ -2964,6 +2971,7 @@ static void __exit pkt_exit(void)
 
unregister_blkdev(pktdev_major, DRIVER_NAME);
mempool_destroy(psd_pool);
+   bioset_free(pkt_bio_set);
 }
 
 MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");




[PATCH 09/11] bcache: use kmalloc to allocate bio in bch_data_verify()

2017-04-19 Thread NeilBrown
This function allocates a bio, then a collection
of pages.  It copes with failure.

It currently uses a mempool() to allocate the bio,
but alloc_page() to allocate the pages.  These fail
in different ways, so the usage is inconsistent.

Change the bio_clone() to bio_clone_kmalloc()
so that no pool is used either for the bio or the pages.

Signed-off-by: NeilBrown 
---
 drivers/md/bcache/debug.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 06f55056aaae..35a5a7210e51 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,7 +110,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
 
-   check = bio_clone(bio, GFP_NOIO);
+   check = bio_clone_kmalloc(bio, GFP_NOIO);
if (!check)
return;
check->bi_opf = REQ_OP_READ;




[PATCH 11/11] block: don't check for BIO_MAX_PAGES in blk_bio_segment_split()

2017-04-19 Thread NeilBrown
blk_bio_segment_split() makes sure bios have no more than
BIO_MAX_PAGES entries in the bi_io_vec.
This was done because bio_clone_bioset() (when given a
mempool bioset) could not handle larger io_vecs.

No driver uses bio_clone_bioset() any more, they all
use bio_clone_fast() if anything, and bio_clone_fast()
doesn't clone the bi_io_vec.

The main user of bio_clone_bioset() at this level
is bounce.c, and bouncing now happens before blk_bio_segment_split(),
so that is not of concern.

So remove the big helpful comment and the code.

Signed-off-by: NeilBrown 
---
 block/blk-merge.c |   16 
 1 file changed, 16 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index e7862e9dcc39..cea544ec5d96 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -108,25 +108,9 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
bool do_split = true;
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
-   unsigned bvecs = 0;
 
bio_for_each_segment(bv, bio, iter) {
/*
-* With arbitrary bio size, the incoming bio may be very
-* big. We have to split the bio into small bios so that
-* each holds at most BIO_MAX_PAGES bvecs because
-* bio_clone_bioset() can fail to allocate big bvecs.
-*
-* Those drivers which will need to use bio_clone_bioset()
-* should tell us in some way.  For now, impose the
-* BIO_MAX_PAGES limit on all queues.
-*
-* TODO: handle users of bio_clone_bioset() differently.
-*/
-   if (bvecs++ >= BIO_MAX_PAGES)
-   goto split;
-
-   /*
 * If the queue doesn't support SG gaps and adding this
 * offset would create a gap, disallow it.
 */




Re: [PATCH 2/2] block: Improve error handling verbosity

2017-04-19 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 1/2] scsi: convert unrecovered read error to -EILSEQ

2017-04-19 Thread Christoph Hellwig
Looks like I won't get the major error status changes into 4.12,
so let's go with these patches for now:

Reviewed-by: Christoph Hellwig 


Re: [PATCH 08/23] scsi: introduce a result field in struct scsi_request

2017-04-19 Thread Christoph Hellwig
On Wed, Apr 19, 2017 at 09:43:00PM -0400, Martin K. Petersen wrote:
> Really wish we could just obliterate drivers/ide.

Same here.  It's been _the_ major pain point for doing block layer
heavy lifting for the last 1 or two years.


Re: [PATCH 02/23] block: remove the blk_execute_rq return value

2017-04-19 Thread h...@lst.de
On Wed, Apr 19, 2017 at 09:07:45PM +, Bart Van Assche wrote:
> > +   blk_execute_rq(or->request->q, NULL, or->request, 0);
> > +   error = or->request ? -EIO : 0;
> 
> Hello Christoph,
> 
> Did you perhaps intend or->request->errors instead of or->request?

Yes, thanks.


Re: [PATCH 3/3] mtip32xx: use runtime tag to initialize command header

2017-04-19 Thread Christoph Hellwig
On Wed, Apr 19, 2017 at 08:31:42AM +0800, Ming Lei wrote:
> mtip32xx supposes that 'request_idx' passed to .init_request()
> is tag of the request, and use that as request's tag to initialize
> command header.
> 
> After MQ IO scheduler is in, request tag assigned isn't same with
> the request index anymore, so cause strange hardware failure on
> mtip32xx, even whole system panic is triggered.
> 
> This patch fixes the issue by initializing command header via
> request's real tag.

This patch looks fine:

Reviewed-by: Christoph Hellwig 

But we should also remove the request_idx parameter from the
init_request and exit_request methods given that they can't be
used once a scheduler is in use.


Re: [PATCH 0/4] blk-mq-sched: allow to use hw tag for sched

2017-04-19 Thread Christoph Hellwig
On Thu, Apr 20, 2017 at 09:03:47AM +0800, Ming Lei wrote:
> If we don't need to reserve tag for internal command, I am happy
> to fix it in the interim. However, the mtip32xx driver has to
> ask blk-mq to reserve the tag zero for internal command. Then
> if we can't allow the driver to use that reserved tag, it looks
> quite awkward, doesn't it?

It doesn't.  Just offset the hardware tags by 1 from the blk-mq tags.

But I'm pretty sure the hardware wouldn't require a specific tag
for the internal ops.


Re: [PATCH v3] dcssblk: add dax_operations support

2017-04-19 Thread kbuild test robot
Hi Dan,

[auto build test ERROR on linus/master]
[also build test ERROR on v4.11-rc7 next-20170419]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Dan-Williams/dcssblk-add-dax_operations-support/20170420-090408
config: s390-defconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=s390 

All error/warnings (new ones prefixed by >>):

>> drivers/s390/block/dcssblk.c:36:46: warning: 'struct dax_device' declared 
>> inside parameter list will not be visible outside of this definition or 
>> declaration
static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t 
pgoff,
 ^~
>> drivers/s390/block/dcssblk.c:49:21: error: variable 'dcssblk_dax_ops' has 
>> initializer but incomplete type
static const struct dax_operations dcssblk_dax_ops = {
^~
>> drivers/s390/block/dcssblk.c:50:2: error: unknown field 'direct_access' 
>> specified in initializer
 .direct_access = dcssblk_dax_direct_access,
 ^
>> drivers/s390/block/dcssblk.c:50:19: warning: excess elements in struct 
>> initializer
 .direct_access = dcssblk_dax_direct_access,
  ^
   drivers/s390/block/dcssblk.c:50:19: note: (near initialization for 
'dcssblk_dax_ops')
   drivers/s390/block/dcssblk.c: In function 'dcssblk_shared_store':
>> drivers/s390/block/dcssblk.c:400:2: error: implicit declaration of function 
>> 'kill_dax' [-Werror=implicit-function-declaration]
 kill_dax(dev_info->dax_dev);
 ^~~~
>> drivers/s390/block/dcssblk.c:401:2: error: implicit declaration of function 
>> 'put_dax' [-Werror=implicit-function-declaration]
 put_dax(dev_info->dax_dev);
 ^~~
   drivers/s390/block/dcssblk.c: In function 'dcssblk_add_store':
>> drivers/s390/block/dcssblk.c:667:22: error: implicit declaration of function 
>> 'alloc_dax' [-Werror=implicit-function-declaration]
 dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
 ^
>> drivers/s390/block/dcssblk.c:667:20: warning: assignment makes pointer from 
>> integer without a cast [-Wint-conversion]
 dev_info->dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
   ^
   drivers/s390/block/dcssblk.c: At top level:
   drivers/s390/block/dcssblk.c:932:1: error: conflicting types for 
'dcssblk_dax_direct_access'
dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
^
   drivers/s390/block/dcssblk.c:36:13: note: previous declaration of 
'dcssblk_dax_direct_access' was here
static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t 
pgoff,
^
   drivers/s390/block/dcssblk.c: In function 'dcssblk_dax_direct_access':
>> drivers/s390/block/dcssblk.c:935:38: error: implicit declaration of function 
>> 'dax_get_private' [-Werror=implicit-function-declaration]
 struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);
 ^~~
>> drivers/s390/block/dcssblk.c:935:38: warning: initialization makes pointer 
>> from integer without a cast [-Wint-conversion]
   drivers/s390/block/dcssblk.c: At top level:
>> drivers/s390/block/dcssblk.c:49:36: error: storage size of 'dcssblk_dax_ops' 
>> isn't known
static const struct dax_operations dcssblk_dax_ops = {
   ^~~
   drivers/s390/block/dcssblk.c:36:13: warning: 'dcssblk_dax_direct_access' 
used but never defined
static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t 
pgoff,
^
   drivers/s390/block/dcssblk.c:932:1: warning: 'dcssblk_dax_direct_access' 
defined but not used [-Wunused-function]
dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
^
   cc1: some warnings being treated as errors

vim +/dcssblk_dax_ops +49 drivers/s390/block/dcssblk.c

30  static int dcssblk_open(struct block_device *bdev, fmode_t mode);
31  static void dcssblk_release(struct gendisk *disk, fmode_t mode);
32  static blk_qc_t dcssblk_make_request(struct request_queue *q,
33  struct bio *bio);
34  static long dcssblk_blk_direct_access(struct block_device *bdev, 
sector_t secnum,
35   void **kaddr, pfn_t *pf

Re: [PATCH v3] axon_ram: add dax_operations support

2017-04-19 Thread Dan Williams
On Wed, Apr 19, 2017 at 8:01 PM, kbuild test robot <l...@intel.com> wrote:
> Hi Dan,
>
> [auto build test ERROR on powerpc/next]
> [also build test ERROR on v4.11-rc7 next-20170419]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
>
>
> url:
> https://github.com/0day-ci/linux/commits/Dan-Williams/axon_ram-add-dax_operations-support/20170420-091615
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next

Hi kbuild team, yes this is the wrong base. It's part of a larger
series [1] and I'm just re-sending a few select patches with updates
from review, rather than the full 33 patch series. Any better way to
send individual updates to a patch in a series without re-sending the
whole series?

[1]: https://lkml.org/lkml/2017/4/14/495


Re: [PATCH v3] axon_ram: add dax_operations support

2017-04-19 Thread kbuild test robot
Hi Dan,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.11-rc7 next-20170419]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Dan-Williams/axon_ram-add-dax_operations-support/20170420-091615
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/sysdev/axonram.c: In function 'axon_ram_dax_direct_access':
>> arch/powerpc/sysdev/axonram.c:176:31: error: implicit declaration of 
>> function 'dax_get_private' [-Werror=implicit-function-declaration]
 struct axon_ram_bank *bank = dax_get_private(dax_dev);
  ^~~
>> arch/powerpc/sysdev/axonram.c:176:31: error: initialization makes pointer 
>> from integer without a cast [-Werror=int-conversion]
   arch/powerpc/sysdev/axonram.c: At top level:
>> arch/powerpc/sysdev/axonram.c:181:21: error: variable 'axon_ram_dax_ops' has 
>> initializer but incomplete type
static const struct dax_operations axon_ram_dax_ops = {
^~
>> arch/powerpc/sysdev/axonram.c:182:2: error: unknown field 'direct_access' 
>> specified in initializer
 .direct_access = axon_ram_dax_direct_access,
 ^
   arch/powerpc/sysdev/axonram.c:182:19: error: excess elements in struct 
initializer [-Werror]
 .direct_access = axon_ram_dax_direct_access,
  ^~
   arch/powerpc/sysdev/axonram.c:182:19: note: (near initialization for 
'axon_ram_dax_ops')
   arch/powerpc/sysdev/axonram.c: In function 'axon_ram_probe':
>> arch/powerpc/sysdev/axonram.c:255:18: error: implicit declaration of 
>> function 'alloc_dax' [-Werror=implicit-function-declaration]
 bank->dax_dev = alloc_dax(bank, bank->disk->disk_name,
 ^
>> arch/powerpc/sysdev/axonram.c:255:16: error: assignment makes pointer from 
>> integer without a cast [-Werror=int-conversion]
 bank->dax_dev = alloc_dax(bank, bank->disk->disk_name,
   ^
>> arch/powerpc/sysdev/axonram.c:313:3: error: implicit declaration of function 
>> 'kill_dax' [-Werror=implicit-function-declaration]
  kill_dax(bank->dax_dev);
  ^~~~
>> arch/powerpc/sysdev/axonram.c:314:3: error: implicit declaration of function 
>> 'put_dax' [-Werror=implicit-function-declaration]
  put_dax(bank->dax_dev);
  ^~~
   arch/powerpc/sysdev/axonram.c: At top level:
>> arch/powerpc/sysdev/axonram.c:181:36: error: storage size of 
>> 'axon_ram_dax_ops' isn't known
static const struct dax_operations axon_ram_dax_ops = {
   ^~~~
   cc1: all warnings being treated as errors

vim +/dax_get_private +176 arch/powerpc/sysdev/axonram.c

   170  };
   171  
   172  static long
   173  axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, 
long nr_pages,
   174 void **kaddr, pfn_t *pfn)
   175  {
 > 176  struct axon_ram_bank *bank = dax_get_private(dax_dev);
   177  
   178  return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, 
pfn);
   179  }
   180  
 > 181  static const struct dax_operations axon_ram_dax_ops = {
 > 182  .direct_access = axon_ram_dax_direct_access,
   183  };
   184  
   185  /**
   186   * axon_ram_probe - probe() method for platform driver
   187   * @device: see platform_driver method
   188   */
   189  static int axon_ram_probe(struct platform_device *device)
   190  {
   191  static int axon_ram_bank_id = -1;
   192  struct axon_ram_bank *bank;
   193  struct resource resource;
   194  int rc = 0;
   195  
   196  axon_ram_bank_id++;
   197  
   198  dev_info(&device->dev, "Found memory controller on %s\n",
   199  device->dev.of_node->full_name);
   200  
   201  bank = kzalloc(sizeof(struct axon_ram_bank), GFP_KERNEL);
   202  if (bank == NULL) {
   203  dev_err(&device->dev, "Out of memory\n");
   204  rc = -ENOMEM;
   205  goto failed;
   206  }
   207  
   208  device->dev.platform_data = bank;
   209  
   210  bank->device = device;
   211  
   212  if (of_address_to_resource(device->dev.of_node, 0, &resource) 
!= 0) {
   213  dev_err(&device->dev, "

Re: [PATCH 8/9] bio: add bvec_iter rewind API

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

Dmitry,

> Some ->bi_end_io handlers (for example: pi_verify or decrypt handlers)
> need to know original data vector, but after bio traverse io-stack it
> may be advanced, split and relocated many times so it is hard to
> guess original iterator. Let's add 'bi_done' counter which accounts
> number of bytes iterator was advanced during its evolution. Later
> end_io handler may easily restore original iterator by rewinding
> iterator to iter->bi_done.

Originally, the original integrity bip was immutable. Any slicing and
dicing was exclusively done on clones. And therefore the original bip
was always suitable for use with verify.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 6/9] T10: Move opencoded contants to common header

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

Dmitry,

> diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
> index 9fba9dd..f892442 100644
> --- a/include/linux/t10-pi.h
> +++ b/include/linux/t10-pi.h
> @@ -33,6 +33,8 @@ struct t10_pi_tuple {
>   __be32 ref_tag; /* Target LBA or indirect LBA */
>  };
>  
> +#define T10_APP_ESCAPE cpu_to_be16(0x)
> +#define T10_REF_ESCAPE cpu_to_be32(0x)

T10 just means SCSI and does not imply protection information. Please
make these T10_PI_{REF,APP}_ESCAPE (ideally in an enum).

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 5/9] bio-integrity: fold bio_integrity_enabled to bio_integrity_prep

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

> Currently all integrity prep hooks are open-coded, and if prepare
> fails we ignore it's code and fail bio with EIO. Let's return real
> error to upper layer, so later caller may react accordingly.
>
> In fact no one want to use bio_integrity_prep() w/o
> bio_integrity_enabled, so it is reasonable to fold it in to one
> function.

Agree with Christoph's comments. Otherwise OK.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 4/9] bio-integrity: fix interface for bio_integrity_trim v2

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

> bio_integrity_trim inherent it's interface from bio_trim and accept
> offset and size, but this API is error prone because data offset must
> always be in sync with bio's data offset. That is why we have integrity
> update hook in bio_advance()
>
> So only meaningful values are: offset == 0, sectors ==
> bio_sectors(bio) Let's just remove them completely.

This interface predates the iter/immutable stuff. It should be fine to
remove the call arguments now.

Reviewed-by: Martin K. Petersen 

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 3/9] bio-integrity: bio_integrity_advance must update integrity seed

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

> SCSI drivers do care about bip_seed so we must update it accordingly.

> + bip->bip_iter.bi_sector += bytes_done >> 9;

This needs to count protection intervals. Otherwise things will break
for block sizes different from 512 bytes.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 2/9] bio-integrity: bio_trim should truncate integrity vector accordingly

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

Reviewed-by: Martin K. Petersen 

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 1/9] bio-integrity: Do not allocate integrity context for bio w/o data

2017-04-19 Thread Martin K. Petersen
Dmitry Monakhov  writes:

> If bio has no data, such as ones from blkdev_issue_flush(),
> then we have nothing to protect.

Reviewed-by: Martin K. Petersen 

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] block: get rid of blk_integrity_revalidate()

2017-04-19 Thread Martin K. Petersen
Ilya Dryomov  writes:

Ilya,

> Commit 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk")
> introduced blk_integrity_revalidate(), which seems to assume ownership
> of the stable pages flag and unilaterally clears it if no blk_integrity
> profile is registered:
>
> if (bi->profile)
> disk->queue->backing_dev_info->capabilities |=
> BDI_CAP_STABLE_WRITES;
> else
> disk->queue->backing_dev_info->capabilities &=
> ~BDI_CAP_STABLE_WRITES;
>
> It's called from revalidate_disk() and rescan_partitions(), making it
> impossible to enable stable pages for drivers that support partitions
> and don't use blk_integrity: while the call in revalidate_disk() can be
> trivially worked around (see zram, which doesn't support partitions and
> hence gets away with zram_revalidate_disk()), rescan_partitions() can
> be triggered from userspace at any time.  This breaks rbd, where the
> ceph messenger is responsible for generating/verifying CRCs.
>
> Since blk_integrity_{un,}register() "must" be used for (un)registering
> the integrity profile with the block layer, move BDI_CAP_STABLE_WRITES
> setting there.  This way drivers that call blk_integrity_register() and
> use integrity infrastructure won't interfere with drivers that don't
> but still want stable pages.

I seem to recall that the reason for the revalidate hook was that either
NVMe or nvdimm had to register an integrity profile prior to the actual
format being known.

So while I am OK with the change from a SCSI perspective, I think we
need Keith and Dan to ack it.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 26/27] scsi: sd: Separate zeroout and discard command choices

2017-04-19 Thread Martin K. Petersen

Paolo,

> Should this be conditional on lbprz, lbpws, lbpws10 and max_ws_blocks?

It is intentional that things can be overridden from userland for
devices that report the "wrong" thing. We do the same for discard so
people can set up udev rules.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH 0/4] blk-mq-sched: allow to use hw tag for sched

2017-04-19 Thread Ming Lei
On Wed, Apr 19, 2017 at 06:55:30PM -0600, Jens Axboe wrote:
> On 04/19/2017 06:44 PM, Ming Lei wrote:
> > On Wed, Apr 19, 2017 at 02:17:45PM -0600, Jens Axboe wrote:
> >> On 04/15/2017 06:38 AM, Ming Lei wrote:
> >>> The 1st patch enhances BLK_MQ_F_NO_SCHED so that we can't change/
> >>> show available io schedulers on devices which don't support io
> >>> scheduler.
> >>>
> >>> The 2nd patch passes BLK_MQ_F_NO_SCHED for avoiding one regression
> >>> on mtip32xx, which is introduced by blk-mq io scheduler.
> >>
> >> I've added 1-2 for 4.11 for now, and then let's plan on getting
> >> a real fix done for 4.12.
> > 
> > The real fix has been posted out already:
> > 
> > http://marc.info/?t=14925619671&r=1&w=2
> > 
> > Could you consider that for 4.12?
> 
> I didn't particularly like that approach. And we are really late
> in the 4.11 cycle, and your first two patches fit the bill for
> how we should fix it in the interim.

If we don't need to reserve tag for internal command, I am happy
to fix it in the interim. However, the mtip32xx driver has to
ask blk-mq to reserve the tag zero for internal command. Then
if we can't allow the driver to use that reserved tag, it looks
quite awkward, doesn't it?

> I'll take a closer look for 4.12, I want to get this fixed
> for real, obviously.

OK, thanks, let's discuss on solutions.

Thanks,
Ming


Re: [PATCH 0/4] blk-mq-sched: allow to use hw tag for sched

2017-04-19 Thread Jens Axboe
On 04/19/2017 06:44 PM, Ming Lei wrote:
> On Wed, Apr 19, 2017 at 02:17:45PM -0600, Jens Axboe wrote:
>> On 04/15/2017 06:38 AM, Ming Lei wrote:
>>> The 1st patch enhances BLK_MQ_F_NO_SCHED so that we can't change/
>>> show available io schedulers on devices which don't support io
>>> scheduler.
>>>
>>> The 2nd patch passes BLK_MQ_F_NO_SCHED for avoiding one regression
>>> on mtip32xx, which is introduced by blk-mq io scheduler.
>>
>> I've added 1-2 for 4.11 for now, and then let's plan on getting
>> a real fix done for 4.12.
> 
> The real fix has been posted out already:
> 
>   http://marc.info/?t=14925619671&r=1&w=2
> 
> Could you consider that for 4.12?

I didn't particularly like that approach. And we are really late
in the 4.11 cycle, and your first two patches fit the bill for
how we should fix it in the interim.

I'll take a closer look for 4.12, I want to get this fixed
for real, obviously.

-- 
Jens Axboe



Re: [PATCH 0/4] blk-mq-sched: allow to use hw tag for sched

2017-04-19 Thread Ming Lei
On Wed, Apr 19, 2017 at 02:17:45PM -0600, Jens Axboe wrote:
> On 04/15/2017 06:38 AM, Ming Lei wrote:
> > The 1st patch enhances BLK_MQ_F_NO_SCHED so that we can't change/
> > show available io schedulers on devices which don't support io
> > scheduler.
> > 
> > The 2nd patch passes BLK_MQ_F_NO_SCHED for avoiding one regression
> > on mtip32xx, which is introduced by blk-mq io scheduler.
> 
> I've added 1-2 for 4.11 for now, and then let's plan on getting
> a real fix done for 4.12.

The real fix has been posted out already:

http://marc.info/?t=14925619671&r=1&w=2

Could you consider that for 4.12?

Thanks,
Ming


Re: [PATCH] blk-throttle: Suppress a compiler warning

2017-04-19 Thread Jens Axboe
On 04/19/2017 05:55 PM, Bart Van Assche wrote:
> Avoid that the following warning is reported when building with
> W=1 and with CONFIG_BLK_DEV_THROTTLING_LOW=n:
> 
> block/blk-throttle.c: In function ‘blk_throtl_bio’:
> block/blk-throttle.c:2042:6: warning: variable ‘ret’ set but not used 
> [-Wunused-but-set-variable]
>   int ret;
>   ^~~
> 
> Signed-off-by: Bart Van Assche 
> Cc: Shaohua Li 
> ---
>  block/blk-throttle.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index c82bf9b1fe72..9081ed9a5345 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -2059,6 +2059,10 @@ bool blk_throtl_bio(struct request_queue *q, struct 
> blkcg_gq *blkg,
>   if (ret == 0 || ret == -EBUSY)
>   bio->bi_cg_private = tg;
> blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
> +#else
> + /* Avoid that the compiler complains about not using ret */
> + if (ret) {
> + }
>  #endif

Ugh, that may get rid of the warning, but it does not help on
the readability or the ifdefs. How about something like the below?
Naming could probably be improved...


diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c82bf9b1fe72..b78db2e5fdff 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2030,6 +2030,20 @@ static inline void throtl_update_latency_buckets(struct 
throtl_data *td)
 }
 #endif
 
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+   int ret;
+
+   ret = bio_associate_current(bio);
+   if (ret == 0 || ret == -EBUSY)
+   bio->bi_cg_private = tg;
+   blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+   bio_associate_current(bio);
+#endif
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
 {
@@ -2039,7 +2053,6 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
bool rw = bio_data_dir(bio);
bool throttled = false;
struct throtl_data *td = tg->td;
-   int ret;
 
WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -2054,12 +2067,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
 
-   ret = bio_associate_current(bio);
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-   if (ret == 0 || ret == -EBUSY)
-   bio->bi_cg_private = tg;
-   blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
-#endif
+   blk_throtl_assoc_bio(tg, bio);
blk_throtl_update_idletime(tg);
 
sq = &tg->service_queue;

-- 
Jens Axboe



[PATCH] blk-throttle: Suppress a compiler warning

2017-04-19 Thread Bart Van Assche
Avoid that the following warning is reported when building with
W=1 and with CONFIG_BLK_DEV_THROTTLING_LOW=n:

block/blk-throttle.c: In function ‘blk_throtl_bio’:
block/blk-throttle.c:2042:6: warning: variable ‘ret’ set but not used 
[-Wunused-but-set-variable]
  int ret;
  ^~~

Signed-off-by: Bart Van Assche 
Cc: Shaohua Li 
---
 block/blk-throttle.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c82bf9b1fe72..9081ed9a5345 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2059,6 +2059,10 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
if (ret == 0 || ret == -EBUSY)
bio->bi_cg_private = tg;
blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+   /* Avoid that the compiler complains about not using ret */
+   if (ret) {
+   }
 #endif
blk_throtl_update_idletime(tg);
 
-- 
2.12.2



Re: [PATCH v2 0/5] Reduce code duplication

2017-04-19 Thread Jens Axboe
On 04/19/2017 03:01 PM, Bart Van Assche wrote:
> Hello Jens,
> 
> This series contains three patches intended to reduce code duplication
> and two patches that simplify / optimize I/O priority handling. Please
> consider these patches for kernel v4.12.

Applied, thanks Bart.

-- 
Jens Axboe



Re: [PATCH v3 8/8] scsi: Implement blk_mq_ops.show_rq()

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 19:25 -0400, Martin K. Petersen wrote:
> Bart Van Assche  writes:
> > Show the SCSI CDB, .eh_eflags and .result for pending SCSI commands
> > in /sys/kernel/debug/block/*/mq/*/dispatch and */rq_list.
> 
> SCSI_EH_CANCEL_CMD is no more (as of a06586325f37).

Thanks Martin. I will remove that flag when I repost this patch series.

Bart.

Re: [PATCH v3 8/8] scsi: Implement blk_mq_ops.show_rq()

2017-04-19 Thread Martin K. Petersen
Bart Van Assche  writes:

Bart,

> Show the SCSI CDB, .eh_eflags and .result for pending SCSI commands
> in /sys/kernel/debug/block/*/mq/*/dispatch and */rq_list.

SCSI_EH_CANCEL_CMD is no more (as of a06586325f37).

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH v3] axon_ram: add dax_operations support

2017-04-19 Thread Dan Williams
Setup a dax_device to have the same lifetime as the axon_ram block
device and add a ->direct_access() method that is equivalent to
axon_ram_direct_access(). Once fs/dax.c has been converted to use
dax_operations the old axon_ram_direct_access() will be removed.

Reported-by: Gerald Schaefer 
Signed-off-by: Dan Williams 
---
Changes since v2:
* fix return code in the alloc_dax() failure case (Gerald)

 arch/powerpc/platforms/Kconfig |1 +
 arch/powerpc/sysdev/axonram.c  |   48 +++-
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 7e3a2ebba29b..33244e3d9375 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -284,6 +284,7 @@ config CPM2
 config AXON_RAM
tristate "Axon DDR2 memory device driver"
depends on PPC_IBM_CELL_BLADE && BLOCK
+   select DAX
default m
help
  It registers one block device per Axon's DDR2 memory bank found
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index f523ac883150..171ba86a3494 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -25,6 +25,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -62,6 +63,7 @@ static int azfs_major, azfs_minor;
 struct axon_ram_bank {
struct platform_device  *device;
struct gendisk  *disk;
+   struct dax_device   *dax_dev;
unsigned intirq_id;
unsigned long   ph_addr;
unsigned long   io_addr;
@@ -137,25 +139,47 @@ axon_ram_make_request(struct request_queue *queue, struct 
bio *bio)
return BLK_QC_T_NONE;
 }
 
+static long
+__axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long 
nr_pages,
+  void **kaddr, pfn_t *pfn)
+{
+   resource_size_t offset = pgoff * PAGE_SIZE;
+
+   *kaddr = (void *) bank->io_addr + offset;
+   *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
+   return (bank->size - offset) / PAGE_SIZE;
+}
+
 /**
  * axon_ram_direct_access - direct_access() method for block device
  * @device, @sector, @data: see block_device_operations method
  */
 static long
-axon_ram_direct_access(struct block_device *device, sector_t sector,
+axon_ram_blk_direct_access(struct block_device *device, sector_t sector,
   void **kaddr, pfn_t *pfn, long size)
 {
struct axon_ram_bank *bank = device->bd_disk->private_data;
-   loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
 
-   *kaddr = (void *) bank->io_addr + offset;
-   *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV);
-   return bank->size - offset;
+   return __axon_ram_direct_access(bank, (sector * 512) / PAGE_SIZE,
+   size / PAGE_SIZE, kaddr, pfn) * PAGE_SIZE;
 }
 
 static const struct block_device_operations axon_ram_devops = {
.owner  = THIS_MODULE,
-   .direct_access  = axon_ram_direct_access
+   .direct_access  = axon_ram_blk_direct_access
+};
+
+static long
+axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long 
nr_pages,
+  void **kaddr, pfn_t *pfn)
+{
+   struct axon_ram_bank *bank = dax_get_private(dax_dev);
+
+   return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn);
+}
+
+static const struct dax_operations axon_ram_dax_ops = {
+   .direct_access = axon_ram_dax_direct_access,
 };
 
 /**
@@ -219,6 +243,7 @@ static int axon_ram_probe(struct platform_device *device)
goto failed;
}
 
+
bank->disk->major = azfs_major;
bank->disk->first_minor = azfs_minor;
bank->disk->fops = &axon_ram_devops;
@@ -227,6 +252,13 @@ static int axon_ram_probe(struct platform_device *device)
sprintf(bank->disk->disk_name, "%s%d",
AXON_RAM_DEVICE_NAME, axon_ram_bank_id);
 
+   bank->dax_dev = alloc_dax(bank, bank->disk->disk_name,
+   &axon_ram_dax_ops);
+   if (!bank->dax_dev) {
+   rc = -ENOMEM;
+   goto failed;
+   }
+
bank->disk->queue = blk_alloc_queue(GFP_KERNEL);
if (bank->disk->queue == NULL) {
dev_err(&device->dev, "Cannot register disk queue\n");
@@ -278,6 +310,8 @@ static int axon_ram_probe(struct platform_device *device)
del_gendisk(bank->disk);
put_disk(bank->disk);
}
+   kill_dax(bank->dax_dev);
+   put_dax(bank->dax_dev);
device->dev.platform_data = NULL;
if (bank->io_addr != 0)
iounmap((void __iomem *) bank->io_addr);
@@ -300,6 +334,8 @@ axon_ram_remove(struct platform_device *device)
 
device_remove_file(>dev, _attr_ecc);
free_irq(bank->irq_id, 

[PATCH v3] dax: add a facility to lookup a dax device by 'host' device name

2017-04-19 Thread Dan Williams
For the current block_device based filesystem-dax path, we need a way
for it to lookup the dax_device associated with a block_device. Add a
'host' property of a dax_device that can be used for this purpose. It is
a free form string, but for a dax_device associated with a block device
it is the bdev name.

This is a stop-gap until filesystems are able to mount on a dax-inode
directly.

Signed-off-by: Dan Williams 
---
Changes since v2:
* fixed slab corruption due to non-null dax_dev->host at
  dax_i_callback() time

 drivers/dax/dax.h|2 +
 drivers/dax/device.c |2 +
 drivers/dax/super.c  |   87 --
 include/linux/dax.h  |1 +
 4 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
index 2472d9da96db..246a24d68d4c 100644
--- a/drivers/dax/dax.h
+++ b/drivers/dax/dax.h
@@ -13,7 +13,7 @@
 #ifndef __DAX_H__
 #define __DAX_H__
 struct dax_device;
-struct dax_device *alloc_dax(void *private);
+struct dax_device *alloc_dax(void *private, const char *host);
 void put_dax(struct dax_device *dax_dev);
 bool dax_alive(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 19a42edbfa03..db68f4fa8ce0 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -645,7 +645,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region 
*dax_region,
goto err_id;
}
 
-   dax_dev = alloc_dax(dev_dax);
+   dax_dev = alloc_dax(dev_dax, NULL);
if (!dax_dev)
goto err_dax;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c9f85f1c086e..8d446674c1da 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -30,6 +30,10 @@ static DEFINE_IDA(dax_minor_ida);
 static struct kmem_cache *dax_cache __read_mostly;
 static struct super_block *dax_superblock __read_mostly;
 
+#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head))
+static struct hlist_head dax_host_list[DAX_HASH_SIZE];
+static DEFINE_SPINLOCK(dax_host_lock);
+
 int dax_read_lock(void)
 {
return srcu_read_lock(&dax_srcu);
@@ -46,12 +50,15 @@ EXPORT_SYMBOL_GPL(dax_read_unlock);
  * struct dax_device - anchor object for dax services
  * @inode: core vfs
  * @cdev: optional character interface for "device dax"
+ * @host: optional name for lookups where the device path is not available
  * @private: dax driver private data
  * @alive: !alive + rcu grace period == no new operations / mappings
  */
 struct dax_device {
+   struct hlist_node list;
struct inode inode;
struct cdev cdev;
+   const char *host;
void *private;
bool alive;
 };
@@ -63,6 +70,11 @@ bool dax_alive(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(dax_alive);
 
+static int dax_host_hash(const char *host)
+{
+   return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE;
+}
+
 /*
  * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
  * that any fault handlers or operations that might have seen
@@ -75,7 +87,13 @@ void kill_dax(struct dax_device *dax_dev)
return;
 
dax_dev->alive = false;
+
synchronize_srcu(&dax_srcu);
+
+   spin_lock(&dax_host_lock);
+   hlist_del_init(&dax_dev->list);
+   spin_unlock(&dax_host_lock);
+
dax_dev->private = NULL;
 }
 EXPORT_SYMBOL_GPL(kill_dax);
@@ -98,6 +116,8 @@ static void dax_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct dax_device *dax_dev = to_dax_dev(inode);
 
+   kfree(dax_dev->host);
+   dax_dev->host = NULL;
ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev));
kmem_cache_free(dax_cache, dax_dev);
 }
@@ -169,26 +189,53 @@ static struct dax_device *dax_dev_get(dev_t devt)
return dax_dev;
 }
 
-struct dax_device *alloc_dax(void *private)
+static void dax_add_host(struct dax_device *dax_dev, const char *host)
+{
+   int hash;
+
+   /*
+* Unconditionally init dax_dev since it's coming from a
+* non-zeroed slab cache
+*/
+   INIT_HLIST_NODE(&dax_dev->list);
+   dax_dev->host = host;
+   if (!host)
+   return;
+
+   hash = dax_host_hash(host);
+   spin_lock(&dax_host_lock);
+   hlist_add_head(&dax_dev->list, &dax_host_list[hash]);
+   spin_unlock(&dax_host_lock);
+}
+
+struct dax_device *alloc_dax(void *private, const char *__host)
 {
struct dax_device *dax_dev;
+   const char *host;
dev_t devt;
int minor;
 
+   host = kstrdup(__host, GFP_KERNEL);
+   if (__host && !host)
+   return NULL;
+
minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL);
if (minor < 0)
-   return NULL;
+   goto err_minor;
 
devt = MKDEV(MAJOR(dax_devt), minor);
dax_dev = dax_dev_get(devt);
if (!dax_dev)
-   goto err_inode;
+   

[PATCH] ligtnvm: fix double blk_put_queue on same queue

2017-04-19 Thread Rakesh Pandit
On an error path in NVM_DEV_CREATE ioctl blk_put_queue is being called
twice: one via blk_cleanup_queue and another via put_disk.  Straight fix
seems to remove queue pointer so that disk_release never ends up caling
blk_put_queue again.

  [  391.808827] WARNING: CPU: 1 PID: 1250 at lib/refcount.c:128 
refcount_sub_and_test+0x70/0x80
  [  391.808830] refcount_t: underflow; use-after-free.
  [ 391.808832] Modules linked in: nf_conntrack_netbios_ns
  [  391.809052] CPU: 1 PID: 1250 Comm: nvme Not tainted.
  [  391.809057] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
 BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 
04/01/2014
  [  391.809060] Call Trace:
  [  391.809079]  dump_stack+0x63/0x86
  [  391.809094]  __warn+0xcb/0xf0
  [  391.809103]  warn_slowpath_fmt+0x5f/0x80
  [  391.809118]  refcount_sub_and_test+0x70/0x80
  [  391.809125]  refcount_dec_and_test+0x11/0x20
  [  391.809136]  kobject_put+0x1f/0x60
  [  391.809149]  blk_put_queue+0x15/0x20
  [  391.809159]  disk_release+0xae/0xf0
  [  391.809172]  device_release+0x32/0x90
  [  391.809184]  kobject_release+0x6a/0x170
  [  391.809196]  kobject_put+0x2f/0x60
  [  391.809206]  put_disk+0x17/0x20
  [  391.809219]  nvm_ioctl_dev_create.isra.16+0x897/0xa30
  [  391.809236]  nvm_ctl_ioctl+0x23c/0x4c0
  [  391.809248]  do_vfs_ioctl+0xa3/0x5f0
  [  391.809258]  SyS_ioctl+0x79/0x90
  [  391.809271]  entry_SYSCALL_64_fastpath+0x1a/0xa9
  [  391.809280] RIP: 0033:0x7f5d3ef363c7
  [  391.809286] RSP: 002b:7ffc72ed8d78 EFLAGS: 0206 ORIG_RAX: 
0010
  [  391.809296] RAX: ffda RBX: 7ffc72edb552 RCX: 
7f5d3ef363c7
  [  391.809301] RDX: 7ffc72ed8d90 RSI: 40804c22 RDI: 
0003
  [  391.809306] RBP: 0001 R08: 0020 R09: 
0001
  [  391.809311] R10: 053f R11: 0206 R12: 

  [  391.809316] R13:  R14: 7ffc72edb58d R15: 
7ffc72edb581

Signed-off-by: Rakesh Pandit 
---
 drivers/lightnvm/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 2c26af3..5d7aa45 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -309,6 +309,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct 
nvm_ioctl_create *create)
tt->exit(targetdata);
 err_init:
blk_cleanup_queue(tqueue);
+   tdisk->queue = NULL;
 err_disk:
put_disk(tdisk);
 err_dev:
-- 
2.5.5



Re: [PATCH 23/23] block: remove the errors field from struct request

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 21:27 +0200, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig 
> Acked-by: Roger Pau Monné 
> Reviewed-by: Konrad Rzeszutek Wilk 

Reviewed-by: Bart Van Assche 

Re: [PATCH 02/23] block: remove the blk_execute_rq return value

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 21:26 +0200, Christoph Hellwig wrote:
> --- a/drivers/scsi/osd/osd_initiator.c
> +++ b/drivers/scsi/osd/osd_initiator.c
> @@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, 
> struct request *req,
>  
>  int osd_execute_request(struct osd_request *or)
>  {
> - int error = blk_execute_rq(or->request->q, NULL, or->request, 0);
> + int error;
> +
> + blk_execute_rq(or->request->q, NULL, or->request, 0);
> + error = or->request ? -EIO : 0;

Hello Christoph,

Did you perhaps intend or->request->errors instead of or->request?

Otherwise this patch looks fine to me.

Bart.

Re: [PATCH 01/23] pd: don't check blk_execute_rq return value.

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 21:26 +0200, Christoph Hellwig wrote:
> The driver never sets req->errors, so blk_execute_rq will always return 0.

Reviewed-by: Bart Van Assche 

[PATCH v2 4/5] block: Inline blk_rq_set_prio()

2017-04-19 Thread Bart Van Assche
Since only a single caller remains, inline blk_rq_set_prio(). Initialize
req->ioprio even if no I/O priority has been set in the bio nor in the
I/O context.

Signed-off-by: Bart Van Assche 
Reviewed-by: Adam Manzanares 
Tested-by: Adam Manzanares 
Reviewed-by: Christoph Hellwig 
Cc: Matias Bjørling 
---
 block/blk-core.c   |  7 ++-
 include/linux/blkdev.h | 14 --
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ec2330113bea..4f0fd8f3328d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1630,14 +1630,19 @@ unsigned int blk_plug_queued_count(struct request_queue 
*q)
 
 void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
+   struct io_context *ioc = rq_ioc(bio);
+
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
 
req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
-   blk_rq_set_prio(req, rq_ioc(bio));
if (ioprio_valid(bio_prio(bio)))
req->ioprio = bio_prio(bio);
+   else if (ioc)
+   req->ioprio = ioc->ioprio;
+   else
+   req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
blk_rq_bio_prep(req->q, req, bio);
 }
 EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index be3abaee935e..92894995f4b7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1088,20 +1088,6 @@ static inline unsigned int blk_rq_count_bios(struct 
request *rq)
 }
 
 /*
- * blk_rq_set_prio - associate a request with prio from ioc
- * @rq: request of interest
- * @ioc: target iocontext
- *
- * Assocate request prio with ioc prio so request based drivers
- * can leverage priority information.
- */
-static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc)
-{
-   if (ioc)
-   rq->ioprio = ioc->ioprio;
-}
-
-/*
  * Request issue related functions.
  */
 extern struct request *blk_peek_request(struct request_queue *q);
-- 
2.12.2



[PATCH v2 5/5] block: Optimize ioprio_best()

2017-04-19 Thread Bart Van Assche
Since ioprio_best() translates IOPRIO_CLASS_NONE into IOPRIO_CLASS_BE
and since lower numerical priority values represent a higher priority
a simple numerical comparison is sufficient.

Signed-off-by: Bart Van Assche 
Reviewed-by: Adam Manzanares 
Tested-by: Adam Manzanares 
Reviewed-by: Christoph Hellwig 
Cc: Matias Bjørling 
---
 block/ioprio.c | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/block/ioprio.c b/block/ioprio.c
index 0c47a00f92a8..4b120c9cf7e8 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -163,22 +163,12 @@ static int get_task_ioprio(struct task_struct *p)
 
 int ioprio_best(unsigned short aprio, unsigned short bprio)
 {
-   unsigned short aclass;
-   unsigned short bclass;
-
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
 
-   aclass = IOPRIO_PRIO_CLASS(aprio);
-   bclass = IOPRIO_PRIO_CLASS(bprio);
-   if (aclass == bclass)
-   return min(aprio, bprio);
-   if (aclass > bclass)
-   return bprio;
-   else
-   return aprio;
+   return min(aprio, bprio);
 }
 
 SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
-- 
2.12.2



[PATCH v2 0/5] Reduce code duplication

2017-04-19 Thread Bart Van Assche
Hello Jens,

This series contains three patches intended to reduce code duplication
and two patches that simplify / optimize I/O priority handling. Please
consider these patches for kernel v4.12.

Thanks,

Bart.

Changes compared to v1:
- Converted ternary operators into an if/then/else-sequence.
- Used EXPORT_SYMBOL_GPL() to export block layer internals.

Bart Van Assche (5):
  block: Export blk_init_request_from_bio()
  null_blk: Use blk_init_request_from_bio() instead of open-coding it
  lightnvm: Use blk_init_request_from_bio() instead of open-coding it
  block: Inline blk_rq_set_prio()
  block: Optimize ioprio_best()

 block/blk-core.c | 12 +---
 block/blk-mq.c   |  2 +-
 block/blk.h  |  1 -
 block/ioprio.c   | 12 +---
 drivers/block/null_blk.c |  9 +
 drivers/nvme/host/lightnvm.c |  6 +-
 include/linux/blkdev.h   | 15 +--
 7 files changed, 14 insertions(+), 43 deletions(-)

-- 
2.12.2



[PATCH v2 1/5] block: Export blk_init_request_from_bio()

2017-04-19 Thread Bart Van Assche
Export this function such that it becomes available to block
drivers.

Signed-off-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Cc: Matias Bjørling 
Cc: Adam Manzanares 
---
 block/blk-core.c   | 5 +++--
 block/blk-mq.c | 2 +-
 block/blk.h| 1 -
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8654aa0cef6d..ec2330113bea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1628,7 +1628,7 @@ unsigned int blk_plug_queued_count(struct request_queue 
*q)
return ret;
 }
 
-void init_request_from_bio(struct request *req, struct bio *bio)
+void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
@@ -1640,6 +1640,7 @@ void init_request_from_bio(struct request *req, struct 
bio *bio)
req->ioprio = bio_prio(bio);
blk_rq_bio_prep(req->q, req, bio);
 }
+EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
@@ -1730,7 +1731,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
 * We don't worry about that case for efficiency. It won't happen
 * often, and the elevators are able to handle it.
 */
-   init_request_from_bio(req, bio);
+   blk_init_request_from_bio(req, bio);
 
if (test_bit(QUEUE_FLAG_SAME_COMP, >queue_flags))
req->cpu = raw_smp_processor_id();
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2ef7b460924..c496692ecc5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1424,7 +1424,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool 
from_schedule)
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
-   init_request_from_bio(rq, bio);
+   blk_init_request_from_bio(rq, bio);
 
blk_account_io_start(rq, true);
 }
diff --git a/block/blk.h b/block/blk.h
index 07d375183f31..cc8e61cdc229 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,7 +60,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
 void blk_exit_rl(struct request_list *rl);
-void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
 void blk_queue_bypass_start(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe9c512cc6fa..be3abaee935e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -924,6 +924,7 @@ extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
+extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
-- 
2.12.2



[PATCH v2 3/5] lightnvm: Use blk_init_request_from_bio() instead of open-coding it

2017-04-19 Thread Bart Van Assche
This patch changes the behavior of the lightnvm driver as follows:
* REQ_FAILFAST_MASK is set for read-ahead requests.
* If no I/O priority has been set in the bio, the I/O priority is
  copied from the I/O context.
* The rq_disk member is initialized if bio->bi_bdev != NULL.
* The bio sector offset is copied into req->__sector instead of
  retaining the value -1 set by blk_mq_alloc_request().
* req->errors is initialized to zero.

Signed-off-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Cc: Matias Bjørling 
Cc: Adam Manzanares 
---
 drivers/nvme/host/lightnvm.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 4b78090518e1..b76e2e36fef4 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -511,11 +511,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct 
nvm_rq *rqd)
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
if (bio) {
-   rq->ioprio = bio_prio(bio);
-   rq->__data_len = bio->bi_iter.bi_size;
-   rq->bio = rq->biotail = bio;
-   if (bio_has_data(bio))
-   rq->nr_phys_segments = bio_phys_segments(q, bio);
+   blk_init_request_from_bio(rq, bio);
} else {
rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
rq->__data_len = 0;
-- 
2.12.2



[PATCH v2 2/5] null_blk: Use blk_init_request_from_bio() instead of open-coding it

2017-04-19 Thread Bart Van Assche
This patch changes the behavior of the null_blk driver for the
LightNVM mode as follows:
* REQ_FAILFAST_MASK is set for read-ahead requests.
* If no I/O priority has been set in the bio, the I/O priority is
  copied from the I/O context.
* The rq_disk member is initialized if bio->bi_bdev != NULL.
* req->errors is initialized to zero.

Signed-off-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Cc: Matias Bjørling 
Cc: Adam Manzanares 
---
 drivers/block/null_blk.c | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index f93906ff31e8..e79e3d24e229 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -443,14 +443,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct 
nvm_rq *rqd)
if (IS_ERR(rq))
return -ENOMEM;
 
-   rq->__sector = bio->bi_iter.bi_sector;
-   rq->ioprio = bio_prio(bio);
-
-   if (bio_has_data(bio))
-   rq->nr_phys_segments = bio_phys_segments(q, bio);
-
-   rq->__data_len = bio->bi_iter.bi_size;
-   rq->bio = rq->biotail = bio;
+   blk_init_request_from_bio(rq, bio);
 
rq->end_io_data = rqd;
 
-- 
2.12.2



Re: [PATCH 0/4] blk-mq-sched: allow to use hw tag for sched

2017-04-19 Thread Jens Axboe
On 04/15/2017 06:38 AM, Ming Lei wrote:
> The 1st patch enhances BLK_MQ_F_NO_SCHED so that we can't change/
> show available io schedulers on devices which don't support io
> scheduler.
> 
> The 2nd patch passes BLK_MQ_F_NO_SCHED for avoiding one regression
> on mtip32xx, which is introduced by blk-mq io scheduler.

I've added 1-2 for 4.11 for now, and then let's plan on getting
a real fix done for 4.12.

-- 
Jens Axboe



Re: [PATCH V4 00/16] Introduce the BFQ I/O scheduler

2017-04-19 Thread Jens Axboe
On 04/19/2017 01:05 PM, Paolo Valente wrote:
> 
>> Il giorno 19 apr 2017, alle ore 16:33, Jens Axboe  ha 
>> scritto:
>>
>> On 04/19/2017 03:23 AM, Paolo Valente wrote:
>>>
 Il giorno 12 apr 2017, alle ore 18:23, Paolo Valente 
  ha scritto:

 Hi,
 new patch series, addressing (both) issues raised by Bart [1], and
 with block/Makefile fixed as suggested by Bart [2].

>>>
>>> Hi Jens,
>>> apparently no complain of any sort on this last series.  Do you think
>>> we could make it for 4.12, or shall we aim at 4.13?
>>
>> We may as well queue it up now, I don't think there's much point in
>> deferring an extra cycle.
>>
> 
> A little scary, but (of course) fine for me.

Reaching the culmination of having your big project merged after
a long time is always a little scary. For now, kick back and open
a Tignanello, and brace yourself for tending to potential bug
reports once the 4.12 merge window opens.

-- 
Jens Axboe



[PATCH 23/23] block: remove the errors field from struct request

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
Reviewed-by: Konrad Rzeszutek Wilk 
---
 block/blk-core.c | 14 +-
 block/blk-exec.c |  3 +--
 block/blk-mq.c   | 10 +++---
 block/blk-timeout.c  |  1 -
 include/linux/blkdev.h   |  2 --
 include/trace/events/block.h | 17 +++--
 kernel/trace/blktrace.c  | 26 --
 7 files changed, 24 insertions(+), 49 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8654aa0cef6d..48f522a02f54 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1633,7 +1633,6 @@ void init_request_from_bio(struct request *req, struct 
bio *bio)
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
 
-   req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
blk_rq_set_prio(req, rq_ioc(bio));
if (ioprio_valid(bio_prio(bio)))
@@ -2567,22 +2566,11 @@ bool blk_update_request(struct request *req, int error, 
unsigned int nr_bytes)
 {
int total_bytes;
 
-   trace_block_rq_complete(req->q, req, nr_bytes);
+   trace_block_rq_complete(req, error, nr_bytes);
 
if (!req->bio)
return false;
 
-   /*
-* For fs requests, rq is just carrier of independent bio's
-* and each partial completion should be handled separately.
-* Reset per-request error on each partial completion.
-*
-* TODO: tj: This is too subtle.  It would be better to let
-* low level drivers do what they see fit.
-*/
-   if (!blk_rq_is_passthrough(req))
-   req->errors = 0;
-
if (error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)) {
char *error_type;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index afa383248c7c..a9451e3b8587 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct 
gendisk *bd_disk,
 
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
-   rq->errors = -ENXIO;
-   __blk_end_request_all(rq, rq->errors);
+   __blk_end_request_all(rq, -ENXIO);
spin_unlock_irq(q->queue_lock);
return;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1f98db082da3..f5e3cf153ad7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -213,7 +213,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct 
blk_mq_ctx *ctx,
 #endif
rq->special = NULL;
/* tag was already set */
-   rq->errors = 0;
rq->extra_len = 0;
 
INIT_LIST_HEAD(>timeout_list);
@@ -624,8 +623,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
 
rq = list_first_entry(_list, struct request, queuelist);
list_del_init(>queuelist);
-   rq->errors = -EIO;
-   blk_mq_end_request(rq, rq->errors);
+   blk_mq_end_request(rq, -EIO);
}
 }
 EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -1032,8 +1030,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, 
struct list_head *list)
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
errors++;
-   rq->errors = -EIO;
-   blk_mq_end_request(rq, rq->errors);
+   blk_mq_end_request(rq, -EIO);
break;
}
 
@@ -1484,8 +1481,7 @@ static void __blk_mq_try_issue_directly(struct request 
*rq, blk_qc_t *cookie,
 
if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
*cookie = BLK_QC_T_NONE;
-   rq->errors = -EIO;
-   blk_mq_end_request(rq, rq->errors);
+   blk_mq_end_request(rq, -EIO);
return;
}
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c0..cbff183f3d9f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
-   /* Can we use req->errors here? */
__blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e8006abce8b6..b9b958a3bd08 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -220,8 +220,6 @@ struct request {
 
void *special;  /* opaque pointer available for LLD use */
 
-   int errors;
-
unsigned int extra_len; /* length of alignment and padding */
 
unsigned long deadline;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 99ed69fad041..d0dbe60d8a6d 100644
--- 

[PATCH 19/23] floppy: switch from req->errors to req->error_count

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 drivers/block/floppy.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index ce102ec47ef2..60d4c7653178 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2805,8 +2805,10 @@ static int set_next_request(void)
fdc_queue = 0;
if (q) {
current_req = blk_fetch_request(q);
-   if (current_req)
+   if (current_req) {
+   current_req->error_count = 0;
break;
+   }
}
} while (fdc_queue != old_pos);
 
@@ -2866,7 +2868,7 @@ static void redo_fd_request(void)
_floppy = floppy_type + DP->autodetect[DRS->probed_format];
} else
probing = 0;
-   errors = &(current_req->errors);
+   errors = &(current_req->error_count);
tmp = make_raw_rw_request();
if (tmp < 2) {
request_done(tmp);
-- 
2.11.0



[PATCH 12/23] dm mpath: don't check for req->errors

2017-04-19 Thread Christoph Hellwig
We'll get all proper errors reported through ->end_io and ->errors will
go away soon.

Signed-off-by: Christoph Hellwig 
---
 drivers/md/dm-mpath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index ab55955ed704..2950b145443d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1492,7 +1492,7 @@ static int do_end_io(struct multipath *m, struct request 
*clone,
 */
int r = DM_ENDIO_REQUEUE;
 
-   if (!error && !clone->errors)
+   if (!error)
return 0;   /* I/O complete */
 
if (noretry_error(error))
-- 
2.11.0



[PATCH 18/23] block: add a error_count field to struct request

2017-04-19 Thread Christoph Hellwig
This is for the legacy floppy and ataflop drivers that currently abuse
->errors for this purpose.  It's stashed away in a union to not grow
the struct size, the other fields are either used by modern drivers
for different purposes or by the I/O scheduler before queuing the I/O
to drivers.

Signed-off-by: Christoph Hellwig 
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d8bcd51b8a47..e8006abce8b6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -175,6 +175,7 @@ struct request {
struct rb_node rb_node; /* sort/lookup */
struct bio_vec special_vec;
void *completion_data;
+   int error_count; /* for legacy drivers, don't use */
};
 
/*
-- 
2.11.0



[PATCH 16/23] blk-mq: remove the error argument to blk_mq_complete_request

2017-04-19 Thread Christoph Hellwig
Now that all drivers that call blk_mq_complete_request have a
->complete callback we can remove the direct call to blk_mq_end_request,
as well as the error argument to blk_mq_complete_request.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq.c| 15 +++
 drivers/block/loop.c  |  4 ++--
 drivers/block/mtip32xx/mtip32xx.c |  4 ++--
 drivers/block/nbd.c   |  4 ++--
 drivers/block/null_blk.c  |  2 +-
 drivers/block/virtio_blk.c|  2 +-
 drivers/block/xen-blkfront.c  |  2 +-
 drivers/md/dm-rq.c|  2 +-
 drivers/nvme/host/core.c  |  2 +-
 drivers/nvme/host/nvme.h  |  2 +-
 drivers/scsi/scsi_lib.c   |  2 +-
 include/linux/blk-mq.h|  2 +-
 12 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2ef7b460924..0c19df66f5a0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -442,17 +442,10 @@ static void blk_mq_stat_add(struct request *rq)
 
 static void __blk_mq_complete_request(struct request *rq)
 {
-   struct request_queue *q = rq->q;
-
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
-
blk_mq_stat_add(rq);
-
-   if (!q->softirq_done_fn)
-   blk_mq_end_request(rq, rq->errors);
-   else
-   blk_mq_ipi_complete_request(rq);
+   blk_mq_ipi_complete_request(rq);
 }
 
 /**
@@ -463,16 +456,14 @@ static void __blk_mq_complete_request(struct request *rq)
  * Ends all I/O on a request. It does not handle partial completions.
  * The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_complete_request(struct request *rq)
 {
struct request_queue *q = rq->q;
 
if (unlikely(blk_should_fake_timeout(q)))
return;
-   if (!blk_mark_rq_complete(rq)) {
-   rq->errors = error;
+   if (!blk_mark_rq_complete(rq))
__blk_mq_complete_request(rq);
-   }
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 86351b3f7350..994403efee19 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -465,7 +465,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
cmd->ret = ret;
-   blk_mq_complete_request(cmd->rq, 0);
+   blk_mq_complete_request(cmd->rq);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -1685,7 +1685,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
/* complete non-aio request */
if (!cmd->use_aio || ret) {
cmd->ret = ret ? -EIO : 0;
-   blk_mq_complete_request(cmd->rq, 0);
+   blk_mq_complete_request(cmd->rq);
}
 }
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 7406de29db58..66a6bd83faae 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -242,7 +242,7 @@ static void mtip_async_complete(struct mtip_port *port,
rq = mtip_rq_from_tag(dd, tag);
 
cmd->status = status;
-   blk_mq_complete_request(rq, 0);
+   blk_mq_complete_request(rq);
 }
 
 /*
@@ -4109,7 +4109,7 @@ static void mtip_no_dev_cleanup(struct request *rq, void 
*data, bool reserv)
 
if (likely(!reserv)) {
cmd->status = -ENODEV;
-   blk_mq_complete_request(rq, 0);
+   blk_mq_complete_request(rq);
} else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, >port->flags)) {
 
cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 09a74a66beb1..d387bef07fcc 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -635,7 +635,7 @@ static void recv_work(struct work_struct *work)
break;
}
 
-   blk_mq_complete_request(blk_mq_rq_from_pdu(cmd), 0);
+   blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
}
atomic_dec(>recv_threads);
wake_up(>recv_wq);
@@ -651,7 +651,7 @@ static void nbd_clear_req(struct request *req, void *data, 
bool reserved)
return;
cmd = blk_mq_rq_to_pdu(req);
cmd->status = -EIO;
-   blk_mq_complete_request(req, 0);
+   blk_mq_complete_request(req);
 }
 
 static void nbd_clear_que(struct nbd_device *nbd)
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 24ca85a70fd8..c27cccec368b 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -281,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
case NULL_IRQ_SOFTIRQ:
switch (queue_mode)  {
case NULL_Q_MQ:
-   blk_mq_complete_request(cmd->rq, 0);
+ 

[PATCH 15/23] xen-blkfront: don't use req->errors

2017-04-19 Thread Christoph Hellwig
xen-blkfront is the last user of rq->errors for passing back errors to
blk-mq, and I'd like to get rid of that.  In the longer run the driver
should be moving more of the completion processing into .complete, but
this is the minimal change to move forward for now.

Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
Reviewed-by: Konrad Rzeszutek Wilk 
---
 drivers/block/xen-blkfront.c | 36 +---
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index abed296ce605..57866355c060 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -115,6 +115,15 @@ struct split_bio {
atomic_t pending;
 };
 
+struct blkif_req {
+   int error;
+};
+
+static inline struct blkif_req *blkif_req(struct request *rq)
+{
+   return blk_mq_rq_to_pdu(rq);
+}
+
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
@@ -907,8 +916,14 @@ static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
+static void blkif_complete_rq(struct request *rq)
+{
+   blk_mq_end_request(rq, blkif_req(rq)->error);
+}
+
 static const struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq,
+   .complete = blkif_complete_rq,
 };
 
 static void blkif_set_queue_limits(struct blkfront_info *info)
@@ -969,7 +984,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 
sector_size,
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
-   info->tag_set.cmd_size = 0;
+   info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
 
if (blk_mq_alloc_tag_set(>tag_set))
@@ -1543,7 +1558,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
unsigned long flags;
struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
struct blkfront_info *info = rinfo->dev_info;
-   int error;
 
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return IRQ_HANDLED;
@@ -1587,37 +1601,36 @@ static irqreturn_t blkif_interrupt(int irq, void 
*dev_id)
continue;
}
 
-   error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+   blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 0 : 
-EIO;
switch (bret->operation) {
case BLKIF_OP_DISCARD:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
struct request_queue *rq = info->rq;
printk(KERN_WARNING "blkfront: %s: %s op 
failed\n",
   info->gd->disk_name, 
op_name(bret->operation));
-   error = -EOPNOTSUPP;
+   blkif_req(req)->error = -EOPNOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
-   blk_mq_complete_request(req, error);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: %s op 
failed\n",
   info->gd->disk_name, 
op_name(bret->operation));
-   error = -EOPNOTSUPP;
+   blkif_req(req)->error = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
 rinfo->shadow[id].req.u.rw.nr_segments == 
0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op 
failed\n",
   info->gd->disk_name, 
op_name(bret->operation));
-   error = -EOPNOTSUPP;
+   blkif_req(req)->error = -EOPNOTSUPP;
}
-   if (unlikely(error)) {
-   if (error == -EOPNOTSUPP)
-   error = 0;
+   if (unlikely(blkif_req(req)->error)) {
+   if (blkif_req(req)->error == -EOPNOTSUPP)
+   blkif_req(req)->error = 0;
info->feature_fua = 0;
info->feature_flush = 0;
  

[PATCH 22/23] blktrace: remove the unused block_rq_abort tracepoint

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 include/trace/events/block.h | 44 ++--
 kernel/trace/blktrace.c  |  9 -
 2 files changed, 10 insertions(+), 43 deletions(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a88ed13446ff..99ed69fad041 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -61,7 +61,16 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
TP_ARGS(bh)
 );
 
-DECLARE_EVENT_CLASS(block_rq_with_error,
+/**
+ * block_rq_requeue - place block IO request back on a queue
+ * @q: queue holding operation
+ * @rq: block IO operation request
+ *
+ * The block operation request @rq is being placed back into queue
+ * @q.  For some reason the request was not completed and needs to be
+ * put back in the queue.
+ */
+TRACE_EVENT(block_rq_requeue,
 
TP_PROTO(struct request_queue *q, struct request *rq),
 
@@ -94,39 +103,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 );
 
 /**
- * block_rq_abort - abort block operation request
- * @q: queue containing the block operation request
- * @rq: block IO operation request
- *
- * Called immediately after pending block IO operation request @rq in
- * queue @q is aborted. The fields in the operation request @rq
- * can be examined to determine which device and sectors the pending
- * operation would access.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_abort,
-
-   TP_PROTO(struct request_queue *q, struct request *rq),
-
-   TP_ARGS(q, rq)
-);
-
-/**
- * block_rq_requeue - place block IO request back on a queue
- * @q: queue holding operation
- * @rq: block IO operation request
- *
- * The block operation request @rq is being placed back into queue
- * @q.  For some reason the request was not completed and needs to be
- * put back in the queue.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
-
-   TP_PROTO(struct request_queue *q, struct request *rq),
-
-   TP_ARGS(q, rq)
-);
-
-/**
  * block_rq_complete - block IO operation completed by device driver
  * @q: queue containing the block operation request
  * @rq: block operations request
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..9f3624dadb09 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -716,12 +716,6 @@ static void blk_add_trace_rq(struct request_queue *q, 
struct request *rq,
rq->cmd_flags, what, rq->errors, 0, NULL);
 }
 
-static void blk_add_trace_rq_abort(void *ignore,
-  struct request_queue *q, struct request *rq)
-{
-   blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
-}
-
 static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
 {
@@ -974,8 +968,6 @@ static void blk_register_tracepoints(void)
 {
int ret;
 
-   ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
-   WARN_ON(ret);
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1020,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
-   unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
 
tracepoint_synchronize_unregister();
 }
-- 
2.11.0



[PATCH 17/23] blk-mq: simplify __blk_mq_complete_request

2017-04-19 Thread Christoph Hellwig
Merge blk_mq_ipi_complete_request and blk_mq_stat_add into their only
caller.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq.c | 25 -
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0c19df66f5a0..1f98db082da3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -406,12 +406,19 @@ static void __blk_mq_complete_request_remote(void *data)
rq->q->softirq_done_fn(rq);
 }
 
-static void blk_mq_ipi_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
 {
struct blk_mq_ctx *ctx = rq->mq_ctx;
bool shared = false;
int cpu;
 
+   if (rq->internal_tag != -1)
+   blk_mq_sched_completed_request(rq);
+   if (rq->rq_flags & RQF_STATS) {
+   blk_mq_poll_stats_start(rq->q);
+   blk_stat_add(rq);
+   }
+
if (!test_bit(QUEUE_FLAG_SAME_COMP, >q->queue_flags)) {
rq->q->softirq_done_fn(rq);
return;
@@ -432,22 +439,6 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
 }
 
-static void blk_mq_stat_add(struct request *rq)
-{
-   if (rq->rq_flags & RQF_STATS) {
-   blk_mq_poll_stats_start(rq->q);
-   blk_stat_add(rq);
-   }
-}
-
-static void __blk_mq_complete_request(struct request *rq)
-{
-   if (rq->internal_tag != -1)
-   blk_mq_sched_completed_request(rq);
-   blk_mq_stat_add(rq);
-   blk_mq_ipi_complete_request(rq);
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:the request being processed
-- 
2.11.0



[PATCH 20/23] ataflop: switch from req->errors to req->error_count

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 drivers/block/ataflop.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 2104b1b4ccda..fa69ecd52cb5 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -617,12 +617,12 @@ static void fd_error( void )
if (!fd_request)
return;
 
-   fd_request->errors++;
-   if (fd_request->errors >= MAX_ERRORS) {
+   fd_request->error_count++;
+   if (fd_request->error_count >= MAX_ERRORS) {
printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
fd_end_request_cur(-EIO);
}
-   else if (fd_request->errors == RECALIBRATE_ERRORS) {
+   else if (fd_request->error_count == RECALIBRATE_ERRORS) {
printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
if (SelectedDrive != -1)
SUD.track = -1;
@@ -1386,7 +1386,7 @@ static void setup_req_params( int drive )
ReqData = ReqBuffer + 512 * ReqCnt;
 
if (UseTrackbuffer)
-   read_track = (ReqCmd == READ && fd_request->errors == 0);
+   read_track = (ReqCmd == READ && fd_request->error_count == 0);
else
read_track = 0;
 
@@ -1409,8 +1409,10 @@ static struct request *set_next_request(void)
fdc_queue = 0;
if (q) {
rq = blk_fetch_request(q);
-   if (rq)
+   if (rq) {
+   rq->error_count = 0;
break;
+   }
}
} while (fdc_queue != old_pos);
 
-- 
2.11.0



[PATCH 08/23] scsi: introduce a result field in struct scsi_request

2017-04-19 Thread Christoph Hellwig
This passes on the scsi_cmnd result field to users of passthrough
requests.  Currently we abuse req->errors for this purpose, but that
field will go away in its current form.

Note that the old IDE code abuses the errors field in very creative
ways and stores all kinds of different values in it.  I didn't dare
to touch this magic, so the abuses are brought forward 1:1.

Signed-off-by: Christoph Hellwig 
---
 block/bsg-lib.c|  8 
 block/bsg.c| 12 +--
 block/scsi_ioctl.c | 14 ++---
 drivers/block/cciss.c  | 42 +++---
 drivers/block/pktcdvd.c|  2 +-
 drivers/block/virtio_blk.c |  2 +-
 drivers/cdrom/cdrom.c  |  2 +-
 drivers/ide/ide-atapi.c| 10 -
 drivers/ide/ide-cd.c   | 20 +-
 drivers/ide/ide-cd_ioctl.c |  2 +-
 drivers/ide/ide-devsets.c  |  4 ++--
 drivers/ide/ide-dma.c  |  2 +-
 drivers/ide/ide-eh.c   | 36 
 drivers/ide/ide-floppy.c   | 10 -
 drivers/ide/ide-io.c   | 10 -
 drivers/ide/ide-ioctls.c   |  4 ++--
 drivers/ide/ide-park.c |  2 +-
 drivers/ide/ide-pm.c   |  8 
 drivers/ide/ide-tape.c |  4 ++--
 drivers/ide/ide-taskfile.c |  6 +++---
 drivers/scsi/osd/osd_initiator.c   |  2 +-
 drivers/scsi/osst.c|  2 +-
 drivers/scsi/qla2xxx/qla_bsg.c |  6 +++---
 drivers/scsi/scsi_lib.c| 15 +++---
 drivers/scsi/scsi_transport_sas.c  |  2 +-
 drivers/scsi/sg.c  |  2 +-
 drivers/scsi/st.c  |  6 +++---
 drivers/target/target_core_pscsi.c |  2 +-
 fs/nfsd/blocklayout.c  |  4 ++--
 include/linux/ide.h|  2 +-
 include/scsi/scsi_request.h|  1 +
 31 files changed, 122 insertions(+), 122 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index cd15f9dbb147..0a23dbba2d30 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
struct bsg_job *job = container_of(kref, struct bsg_job, kref);
struct request *rq = job->req;
 
-   blk_end_request_all(rq, rq->errors);
+   blk_end_request_all(rq, scsi_req(rq)->result);
 
put_device(job->dev);   /* release reference for the request */
 
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
struct scsi_request *rq = scsi_req(req);
int err;
 
-   err = job->req->errors = result;
+   err = scsi_req(job->req)->result = result;
if (err < 0)
/* we're only returning the result field in the reply */
rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ static int bsg_create_job(struct device *dev, struct 
request *req)
  * @q: request queue to manage
  *
  * On error the create_bsg_job function should return a -Exyz error value
- * that will be set to the req->errors.
+ * that will be set to ->result.
  *
  * Drivers/subsys should pass this to the queue init function.
  */
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
 
ret = bsg_create_job(dev, req);
if (ret) {
-   req->errors = ret;
+   scsi_req(req)->result = ret;
blk_end_request_all(req, ret);
spin_lock_irq(q->queue_lock);
continue;
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47..d9da1b613ced 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, 
struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq);
int ret = 0;
 
-   dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+   dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
/*
 * fill in all the output members
 */
-   hdr->device_status = rq->errors & 0xff;
-   hdr->transport_status = host_byte(rq->errors);
-   hdr->driver_status = driver_byte(rq->errors);
+   hdr->device_status = req->result & 0xff;
+   hdr->transport_status = host_byte(req->result);
+   hdr->driver_status = driver_byte(req->result);
hdr->info = 0;
if (hdr->device_status || hdr->transport_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, 
struct sg_io_v4 *hdr,
 * just a protocol response (i.e. non negative), that gets
 * processed above.
 */
-   if (!ret && rq->errors < 0)
-   ret = rq->errors;
+   if (!ret && req->result < 0)
+   ret = req->result;
 
blk_rq_unmap_user(bio);
scsi_req_free_cmd(req);
diff --git a/block/scsi_ioctl.c 

[PATCH 06/23] virtio: fix spelling of virtblk_scsi_request_done

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Reviewed-by: Johannes Thumshirn 
Reviewed-by: Bart Van Assche 
---
 drivers/block/virtio_blk.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index eaf99022bdc6..dbc4e80680b1 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -111,7 +111,7 @@ static int virtblk_add_req_scsi(struct virtqueue *vq, 
struct virtblk_req *vbr,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
 {
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
struct virtio_blk *vblk = req->q->queuedata;
@@ -144,7 +144,7 @@ static inline int virtblk_add_req_scsi(struct virtqueue *vq,
 {
return -EIO;
 }
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
 {
 }
 #define virtblk_ioctl  NULL
@@ -180,7 +180,7 @@ static inline void virtblk_request_done(struct request *req)
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
-   virtblk_scsi_reques_done(req);
+   virtblk_scsi_request_done(req);
break;
case REQ_OP_DRV_IN:
req->errors = (error != 0);
-- 
2.11.0



[PATCH 07/23] virtio_blk: don't use req->errors

2017-04-19 Thread Christoph Hellwig
Remove passing req->errors (which at that point is always 0) to
blk_mq_complete_request, and rely on the virtio status code for the
serial number passthrough request.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Johannes Thumshirn 
---
 drivers/block/virtio_blk.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dbc4e80680b1..8378ad480f77 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -175,19 +175,15 @@ static int virtblk_add_req(struct virtqueue *vq, struct 
virtblk_req *vbr,
 static inline void virtblk_request_done(struct request *req)
 {
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
-   int error = virtblk_result(vbr);
 
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
virtblk_scsi_request_done(req);
break;
-   case REQ_OP_DRV_IN:
-   req->errors = (error != 0);
-   break;
}
 
-   blk_mq_end_request(req, error);
+   blk_mq_end_request(req, virtblk_result(vbr));
 }
 
 static void virtblk_done(struct virtqueue *vq)
@@ -205,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq)
while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, )) != 
NULL) {
struct request *req = blk_mq_rq_from_pdu(vbr);
 
-   blk_mq_complete_request(req, req->errors);
+   blk_mq_complete_request(req, 0);
req_done = true;
}
if (unlikely(virtqueue_is_broken(vq)))
@@ -311,7 +307,7 @@ static int virtblk_get_id(struct gendisk *disk, char 
*id_str)
goto out;
 
blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
-   err = req->errors ? -EIO : 0;
+   err = virtblk_result(blk_mq_rq_to_pdu(req));
 out:
blk_put_request(req);
return err;
-- 
2.11.0



[PATCH 10/23] null_blk: don't pass always-0 req->errors to blk_mq_complete_request

2017-04-19 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 drivers/block/null_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index f93906ff31e8..24ca85a70fd8 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -281,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
case NULL_IRQ_SOFTIRQ:
switch (queue_mode)  {
case NULL_Q_MQ:
-   blk_mq_complete_request(cmd->rq, cmd->rq->errors);
+   blk_mq_complete_request(cmd->rq, 0);
break;
case NULL_Q_RQ:
blk_complete_request(cmd->rq);
-- 
2.11.0



[PATCH 14/23] mtip32xx: add a status field to struct mtip_cmd

2017-04-19 Thread Christoph Hellwig
Instead of using req->errors, which will go away.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/mtip32xx/mtip32xx.c | 16 +---
 drivers/block/mtip32xx/mtip32xx.h |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 05e3e664ea1b..7406de29db58 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -241,7 +241,8 @@ static void mtip_async_complete(struct mtip_port *port,
 
rq = mtip_rq_from_tag(dd, tag);
 
-   blk_mq_complete_request(rq, status);
+   cmd->status = status;
+   blk_mq_complete_request(rq, 0);
 }
 
 /*
@@ -2910,18 +2911,19 @@ static void mtip_softirq_done_fn(struct request *rq)
if (unlikely(cmd->unaligned))
up(>port->cmd_slot_unal);
 
-   blk_mq_end_request(rq, rq->errors);
+   blk_mq_end_request(rq, cmd->status);
 }
 
 static void mtip_abort_cmd(struct request *req, void *data,
bool reserved)
 {
+   struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
 
dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
 
clear_bit(req->tag, dd->port->cmds_to_issue);
-   req->errors = -EIO;
+   cmd->status = -EIO;
mtip_softirq_done_fn(req);
 }
 
@@ -3816,7 +3818,6 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
if (likely(!ret))
return BLK_MQ_RQ_QUEUE_OK;
 
-   rq->errors = ret;
return BLK_MQ_RQ_QUEUE_ERROR;
 }
 
@@ -4106,9 +4107,10 @@ static void mtip_no_dev_cleanup(struct request *rq, void 
*data, bool reserv)
struct driver_data *dd = (struct driver_data *)data;
struct mtip_cmd *cmd;
 
-   if (likely(!reserv))
-   blk_mq_complete_request(rq, -ENODEV);
-   else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, >port->flags)) {
+   if (likely(!reserv)) {
+   cmd->status = -ENODEV;
+   blk_mq_complete_request(rq, 0);
+   } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, >port->flags)) {
 
cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
if (cmd->comp_func)
diff --git a/drivers/block/mtip32xx/mtip32xx.h 
b/drivers/block/mtip32xx/mtip32xx.h
index 7617888f7944..57b41528a824 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -352,6 +352,7 @@ struct mtip_cmd {
int retries; /* The number of retries left for this command. */
 
int direction; /* Data transfer direction */
+   int status;
 };
 
 /* Structure used to describe a port. */
-- 
2.11.0



[PATCH 09/23] loop: zero-fill bio on the submitting cpu

2017-04-19 Thread Christoph Hellwig
In truth I've just audited which blk-mq drivers don't currently have a
complete callback, but I think this change is at least borderline useful.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Ming Lei 
---
 drivers/block/loop.c | 30 ++
 drivers/block/loop.h |  1 +
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3081d83d2ea3..86351b3f7350 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,32 +445,27 @@ static int lo_req_flush(struct loop_device *lo, struct 
request *rq)
return ret;
 }
 
-static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+static void lo_complete_rq(struct request *rq)
 {
-   if (bytes < 0 || op_is_write(req_op(cmd->rq)))
-   return;
+   struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
-   if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+   if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio &&
+cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) {
struct bio *bio = cmd->rq->bio;
 
-   bio_advance(bio, bytes);
+   bio_advance(bio, cmd->ret);
zero_fill_bio(bio);
}
+
+   blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0);
 }
 
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
-   struct request *rq = cmd->rq;
-
-   handle_partial_read(cmd, ret);
 
-   if (ret > 0)
-   ret = 0;
-   else if (ret < 0)
-   ret = -EIO;
-
-   blk_mq_complete_request(rq, ret);
+   cmd->ret = ret;
+   blk_mq_complete_request(cmd->rq, 0);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -1688,8 +1683,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
ret = do_req_filebacked(lo, cmd->rq);
  failed:
/* complete non-aio request */
-   if (!cmd->use_aio || ret)
-   blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+   if (!cmd->use_aio || ret) {
+   cmd->ret = ret ? -EIO : 0;
+   blk_mq_complete_request(cmd->rq, 0);
+   }
 }
 
 static void loop_queue_work(struct kthread_work *work)
@@ -1715,6 +1712,7 @@ static int loop_init_request(void *data, struct request 
*rq,
 static const struct blk_mq_ops loop_mq_ops = {
.queue_rq   = loop_queue_rq,
.init_request   = loop_init_request,
+   .complete   = lo_complete_rq,
 };
 
 static int loop_add(struct loop_device **l, int i)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fb2237c73e61..fecd3f97ef8c 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -70,6 +70,7 @@ struct loop_cmd {
struct request *rq;
struct list_head list;
bool use_aio;   /* use AIO interface to handle I/O */
+   long ret;
struct kiocb iocb;
 };
 
-- 
2.11.0



[PATCH 11/23] dm rq: don't pass irrelevant error code to blk_mq_complete_request

2017-04-19 Thread Christoph Hellwig
dm never uses rq->errors, so there is no need to pass an error argument
to blk_mq_complete_request.

Signed-off-by: Christoph Hellwig 
---
 drivers/md/dm-rq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index e60f1b6845be..1173be21f6f6 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -363,7 +363,7 @@ static void dm_complete_request(struct request *rq, int 
error)
if (!rq->q->mq_ops)
blk_complete_request(rq);
else
-   blk_mq_complete_request(rq, error);
+   blk_mq_complete_request(rq, 0);
 }
 
 /*
-- 
2.11.0



[PATCH 13/23] nbd: don't use req->errors

2017-04-19 Thread Christoph Hellwig
Add a nbd-specific field instead.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Josef Bacik 
---
 drivers/block/nbd.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6e592c2ba8fd..09a74a66beb1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -115,6 +115,7 @@ struct nbd_cmd {
int index;
int cookie;
struct completion send_complete;
+   int status;
 };
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -244,16 +245,14 @@ static void nbd_size_set(struct nbd_device *nbd, loff_t 
blocksize,
nbd_size_update(nbd);
 }
 
-static void nbd_end_request(struct nbd_cmd *cmd)
+static void nbd_complete_rq(struct request *req)
 {
-   struct nbd_device *nbd = cmd->nbd;
-   struct request *req = blk_mq_rq_from_pdu(cmd);
-   int error = req->errors ? -EIO : 0;
+   struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
-   dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
-   error ? "failed" : "done");
+   dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", cmd,
+   cmd->status ? "failed" : "done");
 
-   blk_mq_complete_request(req, error);
+   blk_mq_end_request(req, cmd->status);
 }
 
 /*
@@ -286,7 +285,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct 
request *req,
struct nbd_config *config;
 
if (!refcount_inc_not_zero(>config_refs)) {
-   req->errors = -EIO;
+   cmd->status = -EIO;
return BLK_EH_HANDLED;
}
 
@@ -331,7 +330,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct 
request *req,
"Connection timed out\n");
}
set_bit(NBD_TIMEDOUT, >runtime_flags);
-   req->errors = -EIO;
+   cmd->status = -EIO;
sock_shutdown(nbd);
nbd_config_put(nbd);
 
@@ -574,7 +573,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device 
*nbd, int index)
if (ntohl(reply.error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error 
(%d)\n",
ntohl(reply.error));
-   req->errors = -EIO;
+   cmd->status = -EIO;
return cmd;
}
 
@@ -599,7 +598,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device 
*nbd, int index)
 */
if (nbd_disconnected(config) ||
config->num_connections <= 1) {
-   req->errors = -EIO;
+   cmd->status = -EIO;
return cmd;
}
return ERR_PTR(-EIO);
@@ -636,7 +635,7 @@ static void recv_work(struct work_struct *work)
break;
}
 
-   nbd_end_request(cmd);
+   blk_mq_complete_request(blk_mq_rq_from_pdu(cmd), 0);
}
atomic_dec(>recv_threads);
wake_up(>recv_wq);
@@ -651,8 +650,8 @@ static void nbd_clear_req(struct request *req, void *data, 
bool reserved)
if (!blk_mq_request_started(req))
return;
cmd = blk_mq_rq_to_pdu(req);
-   req->errors = -EIO;
-   nbd_end_request(cmd);
+   cmd->status = -EIO;
+   blk_mq_complete_request(req, 0);
 }
 
 static void nbd_clear_que(struct nbd_device *nbd)
@@ -740,7 +739,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
nbd_config_put(nbd);
return -EINVAL;
}
-   req->errors = 0;
+   cmd->status = 0;
 again:
nsock = config->socks[index];
mutex_lock(>tx_lock);
@@ -1408,6 +1407,7 @@ static int nbd_init_request(void *data, struct request 
*rq,
 
 static const struct blk_mq_ops nbd_mq_ops = {
.queue_rq   = nbd_queue_rq,
+   .complete   = nbd_complete_rq,
.init_request   = nbd_init_request,
.timeout= nbd_xmit_timeout,
 };
-- 
2.11.0



[PATCH 01/23] pd: don't check blk_execute_rq return value.

2017-04-19 Thread Christoph Hellwig
The driver never sets req->errors, so blk_execute_rq will always return 0.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/paride/pd.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index b05e151c9b38..7d2402f90978 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -739,18 +739,15 @@ static int pd_special_command(struct pd_unit *disk,
  enum action (*func)(struct pd_unit *disk))
 {
struct request *rq;
-   int err = 0;
 
rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
 
rq->special = func;
-
-   err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
-
+   blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
blk_put_request(rq);
-   return err;
+   return 0;
 }
 
 /* kernel glue structures */
-- 
2.11.0



[PATCH 05/23] nvme: make nvme_error_status private

2017-04-19 Thread Christoph Hellwig
Currently it's used by the lightnvm passthrough ioctl, but we'd like to make
it private in preparation for a block layer specific error code.  Lightnvm already
returns the real NVMe status anyway, so I think we can just limit it to
returning -EIO for any status set.

This will need a careful audit from the lightnvm folks, though.

Signed-off-by: Christoph Hellwig 
---
 drivers/nvme/host/core.c | 3 +--
 drivers/nvme/host/lightnvm.c | 6 +++---
 drivers/nvme/host/nvme.h | 1 -
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c6f256d74b6b..805f250315ec 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -66,7 +66,7 @@ static DEFINE_SPINLOCK(dev_list_lock);
 
 static struct class *nvme_class;
 
-int nvme_error_status(struct request *req)
+static int nvme_error_status(struct request *req)
 {
switch (nvme_req(req)->status & 0x7ff) {
case NVME_SC_SUCCESS:
@@ -77,7 +77,6 @@ int nvme_error_status(struct request *req)
return -EIO;
}
 }
-EXPORT_SYMBOL_GPL(nvme_error_status);
 
 static inline bool nvme_req_needs_retry(struct request *req)
 {
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index f85e6e57d641..7f407d5f0095 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -599,7 +599,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
__le64 *metadata = NULL;
dma_addr_t metadata_dma;
DECLARE_COMPLETION_ONSTACK(wait);
-   int ret;
+   int ret = 0;
 
rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
NVME_QID_ANY);
@@ -671,8 +671,8 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
 
if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
-   else
-   ret = nvme_error_status(rq);
+   else if (nvme_req(rq)->status & 0x7ff)
+   ret = -EIO;
if (result)
*result = nvme_req(rq)->status & 0x7ff;
if (status)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d7330f75632d..550037f5efea 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -254,7 +254,6 @@ static inline void nvme_end_request(struct request *req, 
__le16 status,
blk_mq_complete_request(req, 0);
 }
 
-int nvme_error_status(struct request *req);
 void nvme_complete_rq(struct request *req);
 void nvme_cancel_request(struct request *req, void *data, bool reserved);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
-- 
2.11.0



[PATCH 04/23] nvme: split nvme status from block req->errors

2017-04-19 Thread Christoph Hellwig
We want our own clearly defined error field for NVMe passthrough commands,
and the request errors field is going away in its current form.

Just store the status and result field in the nvme_request field from
hardirq completion context (using a new helper) and then generate a
Linux errno for the block layer only when we actually need it.

Because we can't overload the status value with a negative error code
for cancelled commands we now have a flags field in struct nvme_request
that contains a bit for this condition.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Johannes Thumshirn 
---
 drivers/nvme/host/core.c | 50 +++-
 drivers/nvme/host/fc.c   | 10 -
 drivers/nvme/host/lightnvm.c |  9 +---
 drivers/nvme/host/nvme.h | 33 +
 drivers/nvme/host/pci.c  | 11 +-
 drivers/nvme/host/rdma.c |  5 ++---
 drivers/nvme/target/loop.c   |  7 ++-
 7 files changed, 65 insertions(+), 60 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d33f829c3ab7..c6f256d74b6b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -66,11 +66,24 @@ static DEFINE_SPINLOCK(dev_list_lock);
 
 static struct class *nvme_class;
 
+int nvme_error_status(struct request *req)
+{
+   switch (nvme_req(req)->status & 0x7ff) {
+   case NVME_SC_SUCCESS:
+   return 0;
+   case NVME_SC_CAP_EXCEEDED:
+   return -ENOSPC;
+   default:
+   return -EIO;
+   }
+}
+EXPORT_SYMBOL_GPL(nvme_error_status);
+
 static inline bool nvme_req_needs_retry(struct request *req)
 {
if (blk_noretry_request(req))
return false;
-   if (req->errors & NVME_SC_DNR)
+   if (nvme_req(req)->status & NVME_SC_DNR)
return false;
if (jiffies - req->start_time >= req->timeout)
return false;
@@ -81,23 +94,13 @@ static inline bool nvme_req_needs_retry(struct request *req)
 
 void nvme_complete_rq(struct request *req)
 {
-   int error = 0;
-
-   if (unlikely(req->errors)) {
-   if (nvme_req_needs_retry(req)) {
-   nvme_req(req)->retries++;
-   blk_mq_requeue_request(req,
-   !blk_mq_queue_stopped(req->q));
-   return;
-   }
-
-   if (blk_rq_is_passthrough(req))
-   error = req->errors;
-   else
-   error = nvme_error_status(req->errors);
+   if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
+   nvme_req(req)->retries++;
+   blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+   return;
}
 
-   blk_mq_end_request(req, error);
+   blk_mq_end_request(req, nvme_error_status(req));
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
@@ -114,7 +117,9 @@ void nvme_cancel_request(struct request *req, void *data, 
bool reserved)
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
-   blk_mq_complete_request(req, status);
+   nvme_req(req)->status = status;
+   blk_mq_complete_request(req, 0);
+
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
@@ -357,6 +362,7 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 
if (!(req->rq_flags & RQF_DONTPREP)) {
nvme_req(req)->retries = 0;
+   nvme_req(req)->flags = 0;
req->rq_flags |= RQF_DONTPREP;
}
 
@@ -413,7 +419,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct 
nvme_command *cmd,
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
-   ret = req->errors;
+   if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+   ret = -EINTR;
+   else
+   ret = nvme_req(req)->status;
  out:
blk_mq_free_request(req);
return ret;
@@ -498,7 +507,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct 
nvme_command *cmd,
}
  submit:
blk_execute_rq(req->q, disk, req, 0);
-   ret = req->errors;
+   if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+   ret = -EINTR;
+   else
+   ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
if (meta && !ret && !write) {
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aad7f9c0be32..450733c8cd24 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1148,6 +1148,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_fc_queue *queue = op->queue;
struct nvme_completion *cqe = >rsp_iu.cqe;
__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
+   union nvme_result result;
 
/*
 * 

[PATCH 02/23] block: remove the blk_execute_rq return value

2017-04-19 Thread Christoph Hellwig
The function only returns -EIO if rq->errors is non-zero, which is not
very useful and lets a large number of callers ignore the return value.

Just let the callers figure out their error themselves.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Johannes Thumshirn 
---
 block/blk-exec.c | 8 +---
 block/scsi_ioctl.c   | 3 ++-
 drivers/block/virtio_blk.c   | 3 ++-
 drivers/cdrom/cdrom.c| 3 ++-
 drivers/ide/ide-atapi.c  | 3 ++-
 drivers/ide/ide-cd.c | 3 ++-
 drivers/ide/ide-cd_ioctl.c   | 3 ++-
 drivers/ide/ide-devsets.c| 4 ++--
 drivers/ide/ide-disk.c   | 3 +--
 drivers/ide/ide-ioctls.c | 7 ---
 drivers/ide/ide-park.c   | 3 ++-
 drivers/ide/ide-pm.c | 3 ++-
 drivers/ide/ide-taskfile.c   | 4 ++--
 drivers/scsi/osd/osd_initiator.c | 5 -
 fs/nfsd/blocklayout.c| 5 +++--
 include/linux/blkdev.h   | 2 +-
 16 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8cd0e9bc8dc8..afa383248c7c 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -92,11 +92,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  *Insert a fully prepared request at the back of the I/O scheduler queue
  *for execution and wait for completion.
  */
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
   struct request *rq, int at_head)
 {
DECLARE_COMPLETION_ONSTACK(wait);
-   int err = 0;
unsigned long hang_check;
 
rq->end_io_data = 
@@ -108,10 +107,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk 
*bd_disk,
while (!wait_for_completion_io_timeout(, hang_check * 
(HZ/2)));
else
wait_for_completion_io();
-
-   if (rq->errors)
-   err = -EIO;
-
-   return err;
 }
 EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 82a43bb19967..b1352143f12f 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, 
struct gendisk *bd_disk,
scsi_req(rq)->cmd[0] = cmd;
scsi_req(rq)->cmd[4] = data;
scsi_req(rq)->cmd_len = 6;
-   err = blk_execute_rq(q, bd_disk, rq, 0);
+   blk_execute_rq(q, bd_disk, rq, 0);
+   err = rq->errors ? -EIO : 0;
blk_put_request(rq);
 
return err;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2d8290169271..eaf99022bdc6 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -310,7 +310,8 @@ static int virtblk_get_id(struct gendisk *disk, char 
*id_str)
if (err)
goto out;
 
-   err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+   blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+   err = req->errors ? -EIO : 0;
 out:
blk_put_request(req);
return err;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 87739649eac2..308501730ab3 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info 
*cdi, __u8 __user *ubuf,
rq->timeout = 60 * HZ;
bio = rq->bio;
 
-   if (blk_execute_rq(q, cdi->disk, rq, 0)) {
+   blk_execute_rq(q, cdi->disk, rq, 0);
+   if (rq->errors) {
struct request_sense *s = req->sense;
ret = -EIO;
cdi->last_sense = s->sense_key;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index feb30061123b..1524797e1776 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk 
*disk,
memcpy(scsi_req(rq)->cmd, pc->c, 12);
if (drive->media == ide_tape)
scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-   error = blk_execute_rq(drive->queue, disk, rq, 0);
+   blk_execute_rq(drive->queue, disk, rq, 0);
+   error = rq->errors ? -EIO : 0;
 put_req:
blk_put_request(rq);
return error;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 74f1b7dc03f7..95c40afa9120 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char 
*cmd,
}
}
 
-   error = blk_execute_rq(drive->queue, info->disk, rq, 0);
+   blk_execute_rq(drive->queue, info->disk, rq, 0);
+   error = rq->errors ? -EIO : 0;
 
if (buffer)
*bufflen = scsi_req(rq)->resid_len;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 9fcefbc8425e..f1ab726bd430 100644

kill req->errors V3

2017-04-19 Thread Christoph Hellwig
Currently the request structure has an errors field that is used in
various different ways.  The oldest drivers use it as an error count,
blk-mq and the generic timeout code assume that it holds a Linux
errno for block completions, and various drivers use it for internal
status values, often overwriting them with Linux errnos later,
that is unless they are processing passthrough requests in which
case they'll leave their errors in it.

This series kills the ->errors field and replaced it with new fields
in the drivers (or an odd hack in a union in struct request for
two bitrotting old drivers).

Also available in the following git tree:

git://git.infradead.org/users/hch/block.git request-errors

Gitweb:


http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/request-errors

Changes since V2:
 - reorder one patch to be earlier in the series
 - fix the argument to a bsg dprintk
 - fix a kerneldoc comment

Changes since V1:
 - rebased on top the latest block for-next tree
 - fix error handling in nfsd blocklayout
 - dropped "scsi: fix fast-fail for non-passthrough requests"


kill req->errors V3

2017-04-19 Thread Christoph Hellwig
Currently the request structure has an errors field that is used in
various different ways.  The oldest drivers use it as an error count,
blk-mq and the generic timeout code assume that it holds a Linux
errno for block completions, and various drivers use it for internal
status values, often overwriting them with Linux errnos later,
that is unless they are processing passthrough requests in which
case they'll leave their errors in it.

This series kills the ->errors field and replaced it with new fields
in the drivers (or an odd hack in a union in struct request for
two bitrotting old drivers).

Also available in the following git tree:

git://git.infradead.org/users/hch/block.git request-errors

Gitweb:


http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/request-errors

Changes since V2:
 - reorder one patch to be earlier in the series
 - fix the argument to a bsg dprintk
 - fix a kerneldoc comment

Changes since V1:
 - rebased on top the latest block for-next tree
 - fix error handling in nfsd blocklayout
 - dropped "scsi: fix fast-fail for non-passthrough requests"


Re: [PATCH V4 00/16] Introduce the BFQ I/O scheduler

2017-04-19 Thread Paolo Valente

> Il giorno 19 apr 2017, alle ore 16:33, Jens Axboe  ha 
> scritto:
> 
> On 04/19/2017 03:23 AM, Paolo Valente wrote:
>> 
>>> Il giorno 12 apr 2017, alle ore 18:23, Paolo Valente 
>>>  ha scritto:
>>> 
>>> Hi,
>>> new patch series, addressing (both) issues raised by Bart [1], and
>>> with block/Makefile fixed as suggested by Bart [2].
>>> 
>> 
>> Hi Jens,
>> apparently no complain of any sort on this last series.  Do you think
>> we could make it for 4.12, or shall we aim at 4.13?
> 
> We may as well queue it up now, I don't think there's much point in
> deferring an extra cycle.
> 

A little scary, but (of course) fine for me.

Thanks,
Paolo

> -- 
> Jens Axboe
> 



Re: [PATCH] lightnvm: assume 64-bit lba numbers

2017-04-19 Thread Matias Bjørling

On 04/19/2017 07:39 PM, Arnd Bergmann wrote:

The driver uses both u64 and sector_t to refer to offsets, and assigns between 
the
two. This causes one harmless warning when sector_t is 32-bit:

drivers/lightnvm/pblk-rb.c: In function 'pblk_rb_write_entry_gc':
include/linux/lightnvm.h:215:20: error: large integer implicitly truncated to 
unsigned type [-Werror=overflow]
drivers/lightnvm/pblk-rb.c:324:22: note: in expansion of macro 'ADDR_EMPTY'

As the driver is already doing this inconsistently, changing the type
won't make it worse and is an easy way to avoid the warning.

Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target")
Signed-off-by: Arnd Bergmann 
---
 drivers/lightnvm/pblk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index c82120ce3be5..11ed7d83f572 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -119,7 +119,7 @@ struct pblk_w_ctx {
struct bio_list bios;   /* Original bios - used for completion
 * in REQ_FUA, REQ_FLUSH case
 */
-   sector_t lba;   /* Logic addr. associated with entry */
+   u64 lba;/* Logic addr. associated with entry */
struct ppa_addr ppa;/* Physic addr. associated with entry */
int flags;  /* Write context flags */
 };



Thanks Arnd. Jens, could you pick this up? Thank you


Re: blk-mq: remove the error argument to blk_mq_complete_request

2017-04-19 Thread h...@lst.de
On Tue, Apr 18, 2017 at 10:50:18PM +, Bart Van Assche wrote:
> On Tue, 2017-04-18 at 08:52 -0700, Christoph Hellwig wrote:
> > Now that we always have a ->complete callback we can remove the direct
> > call to blk_mq_end_request, as well as the error argument to
> > blk_mq_complete_request.
> 
> Hello Christoph,
> 
> Please add a runtime check that issues a warning early if a .complete
> callback function is missing.
> 
> Are you sure that all blk-mq drivers have a .complete callback? In the
> request-errors branch of your block git tree I found the following:

I think the problem is that my above changelog was wrong - we only
require the complete_rq method if the driver calls the
blk_mq_complete_request function.  Both rbd and ubiblock don't in
the tree.  They probably should, but that's work for a separate
series.


[PATCH] lightnvm: assume 64-bit lba numbers

2017-04-19 Thread Arnd Bergmann
The driver uses both u64 and sector_t to refer to offsets, and assigns between 
the
two. This causes one harmless warning when sector_t is 32-bit:

drivers/lightnvm/pblk-rb.c: In function 'pblk_rb_write_entry_gc':
include/linux/lightnvm.h:215:20: error: large integer implicitly truncated to 
unsigned type [-Werror=overflow]
drivers/lightnvm/pblk-rb.c:324:22: note: in expansion of macro 'ADDR_EMPTY'

As the driver is already doing this inconsistently, changing the type
won't make it worse and is an easy way to avoid the warning.

Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target")
Signed-off-by: Arnd Bergmann 
---
 drivers/lightnvm/pblk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index c82120ce3be5..11ed7d83f572 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -119,7 +119,7 @@ struct pblk_w_ctx {
struct bio_list bios;   /* Original bios - used for completion
 * in REQ_FUA, REQ_FLUSH case
 */
-   sector_t lba;   /* Logic addr. associated with entry */
+   u64 lba;/* Logic addr. associated with entry */
struct ppa_addr ppa;/* Physic addr. associated with entry */
int flags;  /* Write context flags */
 };
-- 
2.9.0



Re: clean up a few end_request variants

2017-04-19 Thread Jens Axboe
On 04/12/2017 04:13 AM, Christoph Hellwig wrote:
> Just some trivial cleanup patches that I'd like to offload from
> a big series in the pipe..

Added for 4.12, thanks.

-- 
Jens Axboe



Re: [PATCH 4/5] block: Inline blk_rq_set_prio()

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 08:38 -0600, Jens Axboe wrote:
> On 04/19/2017 12:20 AM, Christoph Hellwig wrote:
> > > + req->ioprio = ioprio_valid(bio_prio(bio)) ? bio_prio(bio) : ioc ?
> > > + ioc->ioprio : IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
> > 
> > I think this would be a tad cleaner with a traditional if / else if / else
> > chain, e.g.
> > 
> > if (ioprio_valid(bio_prio(bio)))
> > req->ioprio = bio_prio(bio);
> > else if (ioc)
> > req->ioprio = ioc->ioprio;
> > else
> > req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
> 
> Agree, I hate ternaries with a vengeance, and a doubly nested one is
> almost a shooting offense.
> 
> Bart, care to respin with this fixed and the GPL export modification?

Hello Christoph and Jens,

Thanks for the review and the feedback. I will make the proposed changes
and repost this patch series.

Bart.

Re: [resend PATCH v2 08/33] dcssblk: add dax_operations support

2017-04-19 Thread Dan Williams
On Wed, Apr 19, 2017 at 8:31 AM, Gerald Schaefer
 wrote:
> On Mon, 17 Apr 2017 12:09:32 -0700
> Dan Williams  wrote:
>
>> Setup a dax_dev to have the same lifetime as the dcssblk block device
>> and add a ->direct_access() method that is equivalent to
>> dcssblk_direct_access(). Once fs/dax.c has been converted to use
>> dax_operations the old dcssblk_direct_access() will be removed.
>>
>> Cc: Gerald Schaefer 
>> Signed-off-by: Dan Williams 
>> ---
>>  drivers/s390/block/Kconfig   |1 +
>>  drivers/s390/block/dcssblk.c |   54 
>> +++---
>>  2 files changed, 46 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
>> index 4a3b62326183..0acb8c2f9475 100644
>> --- a/drivers/s390/block/Kconfig
>> +++ b/drivers/s390/block/Kconfig
>> @@ -14,6 +14,7 @@ config BLK_DEV_XPRAM
>>
>>  config DCSSBLK
>>   def_tristate m
>> + select DAX
>>   prompt "DCSSBLK support"
>>   depends on S390 && BLOCK
>>   help
>> diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
>> index 415d10a67b7a..682a9eb4934d 100644
>> --- a/drivers/s390/block/dcssblk.c
>> +++ b/drivers/s390/block/dcssblk.c
>> @@ -18,6 +18,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>
>> @@ -30,8 +31,10 @@ static int dcssblk_open(struct block_device *bdev, 
>> fmode_t mode);
>>  static void dcssblk_release(struct gendisk *disk, fmode_t mode);
>>  static blk_qc_t dcssblk_make_request(struct request_queue *q,
>>   struct bio *bio);
>> -static long dcssblk_direct_access(struct block_device *bdev, sector_t 
>> secnum,
>> +static long dcssblk_blk_direct_access(struct block_device *bdev, sector_t 
>> secnum,
>>void **kaddr, pfn_t *pfn, long size);
>> +static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t 
>> pgoff,
>> + long nr_pages, void **kaddr, pfn_t *pfn);
>>
>>  static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
>>
>> @@ -40,7 +43,11 @@ static const struct block_device_operations 
>> dcssblk_devops = {
>>   .owner  = THIS_MODULE,
>>   .open   = dcssblk_open,
>>   .release= dcssblk_release,
>> - .direct_access  = dcssblk_direct_access,
>> + .direct_access  = dcssblk_blk_direct_access,
>> +};
>> +
>> +static const struct dax_operations dcssblk_dax_ops = {
>> + .direct_access = dcssblk_dax_direct_access,
>>  };
>>
>>  struct dcssblk_dev_info {
>> @@ -57,6 +64,7 @@ struct dcssblk_dev_info {
>>   struct request_queue *dcssblk_queue;
>>   int num_of_segments;
>>   struct list_head seg_list;
>> + struct dax_device *dax_dev;
>>  };
>>
>>  struct segment_info {
>> @@ -389,6 +397,8 @@ dcssblk_shared_store(struct device *dev, struct 
>> device_attribute *attr, const ch
>>   }
>>   list_del(_info->lh);
>>
>> + kill_dax(dev_info->dax_dev);
>> + put_dax(dev_info->dax_dev);
>>   del_gendisk(dev_info->gd);
>>   blk_cleanup_queue(dev_info->dcssblk_queue);
>>   dev_info->gd->queue = NULL;
>> @@ -525,6 +535,7 @@ dcssblk_add_store(struct device *dev, struct 
>> device_attribute *attr, const char
>>   int rc, i, j, num_of_segments;
>>   struct dcssblk_dev_info *dev_info;
>>   struct segment_info *seg_info, *temp;
>> + struct dax_device *dax_dev;
>>   char *local_buf;
>>   unsigned long seg_byte_size;
>>
>> @@ -654,6 +665,11 @@ dcssblk_add_store(struct device *dev, struct 
>> device_attribute *attr, const char
>>   if (rc)
>>   goto put_dev;
>>
>> + dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
>> + _dax_ops);
>> + if (!dax_dev)
>> + goto put_dev;
>> +
>
> The returned dax_dev should be stored into dev_info->dax_dev, for later use
> by kill/put_dax(). This can also be done directly, so that we don't need the
> local dax_dev variable here.
>
> Also, in the error case, a proper rc should be set before going to put_dev,
> probably -ENOMEM.
>
> I took a quick look at the patches for the other affected drivers, and it
> looks like axonram also has the "missing rc" issue, and brd the "missing
> brd->dax_dev init" issue, pmem seems to be fine.

Thank you for taking a look. I'll get this fixed up.


Re: bfq-mq performance comparison to cfq

2017-04-19 Thread Bart Van Assche
On Wed, 2017-04-19 at 09:02 +0200, Paolo Valente wrote:
> > Il giorno 19 apr 2017, alle ore 07:01, Bart Van Assche 
> >  ha scritto:
> > What API was used by the Android application to tell the I/O scheduler 
> > to optimize for latency? Do you think that it would be sufficient if the 
> > application uses the ioprio_set() system call to set the I/O priority to 
> > IOPRIO_CLASS_RT?
> 
> That's exactly the hack we are using in our prototype.  However, it
> can only be a temporary hack, because it mixes two slightly different
> concepts: 1) the activation of weight raising and other mechanisms for
> reducing latency for the target app, 2) the assignment of a different
> priority class, which (cleanly) means just that processes in a lower
> priority class will be served only when the processes of the target
> app have no pending I/O request.  Finding a clean boosting API would
> be one of the main steps to turn our prototype into a usable solution.

Hello Paolo,

Sorry but I do not agree that you call this use of I/O priorities a hack.
I also do not agree that I/O requests submitted by processes in a lower
priority class will only be served by the I/O scheduler when there are no
pending requests in a higher class. It wouldn't be that hard to modify I/O
schedulers that support I/O priorities to avoid the starvation you referred
to. What I expect that will happen is that sooner or later a Linux
distributor will start receiving bug reports about the heuristics for
detecting interactive and streaming applications and that the person who
will work on that bug report will realize that it will be easier to remove
those heuristics from BFQ and to modify streaming applications and the
software that starts interactive applications (e.g. a window manager) to
use a higher I/O priority.

Please also note that what I described above may require to introduce
additional I/O priorities in the Linux kernel next to the existing I/O
priorities RT, BE and NONE and that this may require to map multiple of
these priorities onto the same drive priority.

Bart.

Re: [resend PATCH v2 08/33] dcssblk: add dax_operations support

2017-04-19 Thread Gerald Schaefer
On Mon, 17 Apr 2017 12:09:32 -0700
Dan Williams  wrote:

> Setup a dax_dev to have the same lifetime as the dcssblk block device
> and add a ->direct_access() method that is equivalent to
> dcssblk_direct_access(). Once fs/dax.c has been converted to use
> dax_operations the old dcssblk_direct_access() will be removed.
> 
> Cc: Gerald Schaefer 
> Signed-off-by: Dan Williams 
> ---
>  drivers/s390/block/Kconfig   |1 +
>  drivers/s390/block/dcssblk.c |   54 
> +++---
>  2 files changed, 46 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
> index 4a3b62326183..0acb8c2f9475 100644
> --- a/drivers/s390/block/Kconfig
> +++ b/drivers/s390/block/Kconfig
> @@ -14,6 +14,7 @@ config BLK_DEV_XPRAM
> 
>  config DCSSBLK
>   def_tristate m
> + select DAX
>   prompt "DCSSBLK support"
>   depends on S390 && BLOCK
>   help
> diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
> index 415d10a67b7a..682a9eb4934d 100644
> --- a/drivers/s390/block/dcssblk.c
> +++ b/drivers/s390/block/dcssblk.c
> @@ -18,6 +18,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
> 
> @@ -30,8 +31,10 @@ static int dcssblk_open(struct block_device *bdev, fmode_t 
> mode);
>  static void dcssblk_release(struct gendisk *disk, fmode_t mode);
>  static blk_qc_t dcssblk_make_request(struct request_queue *q,
>   struct bio *bio);
> -static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
> +static long dcssblk_blk_direct_access(struct block_device *bdev, sector_t 
> secnum,
>void **kaddr, pfn_t *pfn, long size);
> +static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t 
> pgoff,
> + long nr_pages, void **kaddr, pfn_t *pfn);
> 
>  static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
> 
> @@ -40,7 +43,11 @@ static const struct block_device_operations dcssblk_devops 
> = {
>   .owner  = THIS_MODULE,
>   .open   = dcssblk_open,
>   .release= dcssblk_release,
> - .direct_access  = dcssblk_direct_access,
> + .direct_access  = dcssblk_blk_direct_access,
> +};
> +
> +static const struct dax_operations dcssblk_dax_ops = {
> + .direct_access = dcssblk_dax_direct_access,
>  };
> 
>  struct dcssblk_dev_info {
> @@ -57,6 +64,7 @@ struct dcssblk_dev_info {
>   struct request_queue *dcssblk_queue;
>   int num_of_segments;
>   struct list_head seg_list;
> + struct dax_device *dax_dev;
>  };
> 
>  struct segment_info {
> @@ -389,6 +397,8 @@ dcssblk_shared_store(struct device *dev, struct 
> device_attribute *attr, const ch
>   }
>   list_del(_info->lh);
> 
> + kill_dax(dev_info->dax_dev);
> + put_dax(dev_info->dax_dev);
>   del_gendisk(dev_info->gd);
>   blk_cleanup_queue(dev_info->dcssblk_queue);
>   dev_info->gd->queue = NULL;
> @@ -525,6 +535,7 @@ dcssblk_add_store(struct device *dev, struct 
> device_attribute *attr, const char
>   int rc, i, j, num_of_segments;
>   struct dcssblk_dev_info *dev_info;
>   struct segment_info *seg_info, *temp;
> + struct dax_device *dax_dev;
>   char *local_buf;
>   unsigned long seg_byte_size;
> 
> @@ -654,6 +665,11 @@ dcssblk_add_store(struct device *dev, struct 
> device_attribute *attr, const char
>   if (rc)
>   goto put_dev;
> 
> + dax_dev = alloc_dax(dev_info, dev_info->gd->disk_name,
> + _dax_ops);
> + if (!dax_dev)
> + goto put_dev;
> +

The returned dax_dev should be stored into dev_info->dax_dev, for later use
by kill/put_dax(). This can also be done directly, so that we don't need the
local dax_dev variable here.

Also, in the error case, a proper rc should be set before going to put_dev,
probably -ENOMEM.

I took a quick look at the patches for the other affected drivers, and it
looks like axonram also has the "missing rc" issue, and brd the "missing
brd->dax_dev init" issue, pmem seems to be fine.

>   get_device(_info->dev);
>   device_add_disk(_info->dev, dev_info->gd);
> 
> @@ -752,6 +768,8 @@ dcssblk_remove_store(struct device *dev, struct 
> device_attribute *attr, const ch
>   }
> 
>   list_del(_info->lh);
> + kill_dax(dev_info->dax_dev);
> + put_dax(dev_info->dax_dev);
>   del_gendisk(dev_info->gd);
>   blk_cleanup_queue(dev_info->dcssblk_queue);
>   dev_info->gd->queue = NULL;
> @@ -883,21 +901,39 @@ dcssblk_make_request(struct request_queue *q, struct 
> bio *bio)
>  }
> 
>  static long
> -dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
> +__dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
> + long nr_pages, void **kaddr, pfn_t *pfn)
> +{
> + resource_size_t offset = pgoff * PAGE_SIZE;
> + 

Re: [PATCH 5/8] nowait aio: return on congested block device

2017-04-19 Thread Goldwyn Rodrigues


On 04/19/2017 01:45 AM, Christoph Hellwig wrote:
> 
>> +if (bio->bi_opf & REQ_NOWAIT) {
>> +if (!blk_queue_nowait(q)) {
>> +err = -EOPNOTSUPP;
>> +goto end_io;
>> +}
>> +if (!(bio->bi_opf & REQ_SYNC)) {
> 
> I don't understand this check at all..

It is to ensure at the block layer that NOWAIT comes only for DIRECT
calls. I should probably change it to REQ_SYNC | REQ_IDLE.

> 
>> +if (unlikely(!blk_queue_dying(q) && (bio->bi_opf & 
>> REQ_NOWAIT)))
> 
> Please break lines after 80 characters.
> 
>> @@ -119,6 +119,9 @@ struct request *blk_mq_sched_get_request(struct 
>> request_queue *q,
>>  if (likely(!data->hctx))
>>  data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
>>  
>> +if (bio && (bio->bi_opf & REQ_NOWAIT))
>> +data->flags |= BLK_MQ_REQ_NOWAIT;
> 
> Check the op flag again here.
> 
>> +++ b/block/blk-mq.c
>> @@ -1538,6 +1538,8 @@ static blk_qc_t blk_mq_make_request(struct 
>> request_queue *q, struct bio *bio)
>>  rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
>>  if (unlikely(!rq)) {
>>  __wbt_done(q->rq_wb, wb_acct);
>> +if (bio && (bio->bi_opf & REQ_NOWAIT))
>> +bio_wouldblock_error(bio);
> 
> bio is dereferenced unconditionally above, so you can do the same.
> 
>> @@ -1662,6 +1664,8 @@ static blk_qc_t blk_sq_make_request(struct 
>> request_queue *q, struct bio *bio)
>>  rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
>>  if (unlikely(!rq)) {
>>  __wbt_done(q->rq_wb, wb_acct);
>> +if (bio && (bio->bi_opf & REQ_NOWAIT))
>> +bio_wouldblock_error(bio);
> 
> Same here.  Although blk_sq_make_request is gone anyway in the current
> block tree..
> 
>> +/* Request queue supports BIO_NOWAIT */
>> +queue_flag_set_unlocked(QUEUE_FLAG_NOWAIT, q);
> 
> BIO_NOWAIT is gone.  And the comment would not be needed if the
> flag had a more descriptive name, e.g. QUEUE_FLAG_NOWAIT_SUPPORT.
> 
> And I think all request based drivers should set the flag implicitly
> as ->queuecommand can't sleep, and ->queue_rq only when it's always
> offloaded to a workqueue when the BLK_MQ_F_BLOCKING flag is set.
> 

Yes, Do we have a central point (like a probe() function call?) where
this can be done?


-- 
Goldwyn


Re: RFC: drop the T10 OSD code and its users

2017-04-19 Thread Jens Axboe
On Wed, Apr 12 2017, Christoph Hellwig wrote:
> The only real user of the T10 OSD protocol, the pNFS object layout
> driver never went to the point of having shipping products, and the
> other two users (osdblk and exofs) were simple examples of its usage.
> 
> The code has been mostly unmaintained for years and is getting in the
> way of block / SCSI changes, so I think it's finally time to drop it.
> 
> These patches are against Jens' block for-next tree as that already

I've added 1/4 for now, as that one should obviously go. If we have
consensus to kill the rest, I'm all for it.

-- 
Jens Axboe



Re: [PATCH 26/27] scsi: sd: Separate zeroout and discard command choices

2017-04-19 Thread Paolo Bonzini


On 05/04/2017 19:21, Christoph Hellwig wrote:
> +static ssize_t
> +zeroing_mode_store(struct device *dev, struct device_attribute *attr,
> +const char *buf, size_t count)
> +{
> + struct scsi_disk *sdkp = to_scsi_disk(dev);
> +
> + if (!capable(CAP_SYS_ADMIN))
> + return -EACCES;
> +
> +
> + if (!strncmp(buf, zeroing_mode[SD_ZERO_WRITE], 20))
> + sdkp->zeroing_mode = SD_ZERO_WRITE;
> + else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS], 20))
> + sdkp->zeroing_mode = SD_ZERO_WS;
> + else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS16_UNMAP], 20))
> + sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
> + else if (!strncmp(buf, zeroing_mode[SD_ZERO_WS10_UNMAP], 20))
> + sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;

Should this be conditional on lbprz, lbpws, lbpws10 and max_ws_blocks?

Thanks,

Paolo


Re: [PATCH 4/5] block: Inline blk_rq_set_prio()

2017-04-19 Thread Jens Axboe
On 04/19/2017 12:20 AM, Christoph Hellwig wrote:
>> +req->ioprio = ioprio_valid(bio_prio(bio)) ? bio_prio(bio) : ioc ?
>> +ioc->ioprio : IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
> 
> I think this would be a tad cleaner with a traditional if / else if / else
> chain, e.g.
> 
>   if (ioprio_valid(bio_prio(bio)))
>   req->ioprio = bio_prio(bio);
>   else if (ioc)
>   req->ioprio = ioc->ioprio;
>   else
>   req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

Agree, I hate ternaries with a vengeance, and a doubly nested one is
almost a shooting offense.

Bart, care to respin with this fixed and the GPL export modification?

-- 
Jens Axboe



Re: [PATCH] block: Make writeback throttling defaults consistent for SQ devices

2017-04-19 Thread Jens Axboe
On 04/19/2017 03:33 AM, Jan Kara wrote:
> When CFQ is used as an elevator, it disables writeback throttling
> because they don't play well together. Later when a different elevator
> is chosen for the device, writeback throttling doesn't get enabled
> again as it should. Make sure CFQ enables writeback throttling (if it
> should be enabled by default) when we switch from it to another IO
> scheduler.

Thanks Jan, added.

-- 
Jens Axboe



Re: [PATCH V4 00/16] Introduce the BFQ I/O scheduler

2017-04-19 Thread Jens Axboe
On 04/19/2017 03:23 AM, Paolo Valente wrote:
> 
>> Il giorno 12 apr 2017, alle ore 18:23, Paolo Valente 
>>  ha scritto:
>>
>> Hi,
>> new patch series, addressing (both) issues raised by Bart [1], and
>> with block/Makefile fixed as suggested by Bart [2].
>>
> 
> Hi Jens,
> apparently no complaint of any sort on this last series.  Do you think
> we could make it for 4.12, or shall we aim at 4.13?

We may as well queue it up now, I don't think there's much point in
deferring an extra cycle.

-- 
Jens Axboe



Re: [PATCH] nbd: set the max segment size to UINT_MAX

2017-04-19 Thread Jens Axboe
On 04/18/2017 02:22 PM, Josef Bacik wrote:
> NBD doesn't care about limiting the segment size, let the user push the
> largest bio's they want.  This allows us to control the request size
> solely through max_sectors_kb.

Thanks Josef, applied for 4.12.

-- 
Jens Axboe



Re: [GIT PULL] (xen) stable/for-jens-4.12

2017-04-19 Thread Jens Axboe
On 04/18/2017 12:31 PM, Konrad Rzeszutek Wilk wrote:
> Hey Jens,
> 
> Please git pull the following branch:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git 
> stable/for-jens-4.12
> 
> which is based on your 'for-4.12/block' branch. It has one fix - to emit an
> uevent whenever the size of the guest disk image changes.

Pulled, thanks.

-- 
Jens Axboe



Re: [PATCH 2/8] nowait aio: Introduce RWF_NOWAIT

2017-04-19 Thread Jan Kara
On Wed 19-04-17 05:30:24, Goldwyn Rodrigues wrote:
> 
> 
> On 04/19/2017 01:39 AM, Christoph Hellwig wrote:
> > 
> >> @@ -1593,6 +1593,11 @@ static int io_submit_one(struct kioctx *ctx, struct 
> >> iocb __user *user_iocb,
> >>}
> >>  
> >>req->common.ki_flags |= iocb_rw_flags(iocb->aio_rw_flags);
> >> +  if ((req->common.ki_flags & IOCB_NOWAIT) &&
> >> +  !(req->common.ki_flags & IOCB_DIRECT)) {
> >> +  ret = -EINVAL;
> >> +  goto out_put_req;
> >> +  }
> > 
> > Wrong indentation.  Also I think this should be EOPNOTSUPP here.
> > 
> 
> Do we plan to add support for nowait in buffered I/O in the future? It
> is just too complicated. EINVAL suits best in this case.

Well, it may be complicated in the current implementation but IMO it makes
sense to ask for such functionality (see e.g. patches for non-blocking
buffered reads that were flying around a year or two ago). So I agree with
Christoph that EOPNOTSUPP is more logical.

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 2/8] nowait aio: Introduce RWF_NOWAIT

2017-04-19 Thread Goldwyn Rodrigues


On 04/19/2017 01:39 AM, Christoph Hellwig wrote:
> 
>> @@ -1593,6 +1593,11 @@ static int io_submit_one(struct kioctx *ctx, struct 
>> iocb __user *user_iocb,
>>  }
>>  
>>  req->common.ki_flags |= iocb_rw_flags(iocb->aio_rw_flags);
>> +if ((req->common.ki_flags & IOCB_NOWAIT) &&
>> +!(req->common.ki_flags & IOCB_DIRECT)) {
>> +ret = -EINVAL;
>> +goto out_put_req;
>> +}
> 
> Wrong indentation.  Also I think this should be EOPNOTSUPP here.
> 

Do we plan to add support for nowait in buffered I/O in the future? It
is just too complicated. EINVAL suits best in this case.

-- 
Goldwyn


Re: [PATCH] block: Make writeback throttling defaults consistent for SQ devices

2017-04-19 Thread Jan Kara
On Wed 19-04-17 11:33:27, Jan Kara wrote:
> When CFQ is used as an elevator, it disables writeback throttling
> because they don't play well together. Later when a different elevator
> is chosen for the device, writeback throttling doesn't get enabled
> again as it should. Make sure CFQ enables writeback throttling (if it
> should be enabled by default) when we switch from it to another IO
> scheduler.
> 
> Signed-off-by: Jan Kara 

Forgot to change header to v2 so I'm rather writing it here so that it's
clear.

Honza

> ---
>  block/blk-sysfs.c | 19 +--
>  block/blk-wbt.c   | 19 +++
>  block/blk-wbt.h   |  4 
>  block/elevator.c  |  3 +++
>  4 files changed, 27 insertions(+), 18 deletions(-)
> 
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index fc20489f0d2b..f85723332288 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -844,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
>   .release= blk_release_queue,
>  };
>  
> -static void blk_wb_init(struct request_queue *q)
> -{
> -#ifndef CONFIG_BLK_WBT_MQ
> - if (q->mq_ops)
> - return;
> -#endif
> -#ifndef CONFIG_BLK_WBT_SQ
> - if (q->request_fn)
> - return;
> -#endif
> -
> - /*
> -  * If this fails, we don't get throttling
> -  */
> - wbt_init(q);
> -}
> -
>  int blk_register_queue(struct gendisk *disk)
>  {
>   int ret;
> @@ -908,7 +891,7 @@ int blk_register_queue(struct gendisk *disk)
>  
>   kobject_uevent(>kobj, KOBJ_ADD);
>  
> - blk_wb_init(q);
> + wbt_enable_default(q);
>  
>   blk_throtl_register_queue(q);
>  
> diff --git a/block/blk-wbt.c b/block/blk-wbt.c
> index b3b79149d3a0..26e1bb617877 100644
> --- a/block/blk-wbt.c
> +++ b/block/blk-wbt.c
> @@ -665,6 +665,25 @@ void wbt_disable_default(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(wbt_disable_default);
>  
> +/*
> + * Enable wbt if defaults are configured that way
> + */
> +void wbt_enable_default(struct request_queue *q)
> +{
> + /* Throttling already enabled? */
> + if (q->rq_wb)
> + return;
> +
> + /* Queue not registered? Maybe shutting down... */
> + if (!test_bit(QUEUE_FLAG_REGISTERED, >queue_flags))
> + return;
> +
> + if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
> + (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
> + wbt_init(q);
> +}
> +EXPORT_SYMBOL_GPL(wbt_enable_default);
> +
>  u64 wbt_default_latency_nsec(struct request_queue *q)
>  {
>   /*
> diff --git a/block/blk-wbt.h b/block/blk-wbt.h
> index ad6c78507c3a..df6de50c5d59 100644
> --- a/block/blk-wbt.h
> +++ b/block/blk-wbt.h
> @@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
>  void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
>  void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
>  void wbt_disable_default(struct request_queue *);
> +void wbt_enable_default(struct request_queue *);
>  
>  void wbt_set_queue_depth(struct rq_wb *, unsigned int);
>  void wbt_set_write_cache(struct rq_wb *, bool);
> @@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct 
> blk_issue_stat *stat)
>  static inline void wbt_disable_default(struct request_queue *q)
>  {
>  }
> +static inline void wbt_enable_default(struct request_queue *q)
> +{
> +}
>  static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
>  {
>  }
> diff --git a/block/elevator.c b/block/elevator.c
> index dbeecf7be719..fb50416b5aae 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -41,6 +41,7 @@
>  
>  #include "blk.h"
>  #include "blk-mq-sched.h"
> +#include "blk-wbt.h"
>  
>  static DEFINE_SPINLOCK(elv_list_lock);
>  static LIST_HEAD(elv_list);
> @@ -877,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
>   kobject_uevent(>kobj, KOBJ_REMOVE);
>   kobject_del(>kobj);
>   e->registered = 0;
> + /* Re-enable throttling in case elevator disabled it */
> + wbt_enable_default(q);
>   }
>  }
>  EXPORT_SYMBOL(elv_unregister_queue);
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR


[PATCH] block: Make writeback throttling defaults consistent for SQ devices

2017-04-19 Thread Jan Kara
When CFQ is used as an elevator, it disables writeback throttling
because they don't play well together. Later when a different elevator
is chosen for the device, writeback throttling doesn't get enabled
again as it should. Make sure CFQ enables writeback throttling (if it
should be enabled by default) when we switch from it to another IO
scheduler.

Signed-off-by: Jan Kara 
---
 block/blk-sysfs.c | 19 +--
 block/blk-wbt.c   | 19 +++
 block/blk-wbt.h   |  4 
 block/elevator.c  |  3 +++
 4 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fc20489f0d2b..f85723332288 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -844,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
.release= blk_release_queue,
 };
 
-static void blk_wb_init(struct request_queue *q)
-{
-#ifndef CONFIG_BLK_WBT_MQ
-   if (q->mq_ops)
-   return;
-#endif
-#ifndef CONFIG_BLK_WBT_SQ
-   if (q->request_fn)
-   return;
-#endif
-
-   /*
-* If this fails, we don't get throttling
-*/
-   wbt_init(q);
-}
-
 int blk_register_queue(struct gendisk *disk)
 {
int ret;
@@ -908,7 +891,7 @@ int blk_register_queue(struct gendisk *disk)
 
kobject_uevent(>kobj, KOBJ_ADD);
 
-   blk_wb_init(q);
+   wbt_enable_default(q);
 
blk_throtl_register_queue(q);
 
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index b3b79149d3a0..26e1bb617877 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -665,6 +665,25 @@ void wbt_disable_default(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(wbt_disable_default);
 
+/*
+ * Enable wbt if defaults are configured that way
+ */
+void wbt_enable_default(struct request_queue *q)
+{
+   /* Throttling already enabled? */
+   if (q->rq_wb)
+   return;
+
+   /* Queue not registered? Maybe shutting down... */
+   if (!test_bit(QUEUE_FLAG_REGISTERED, >queue_flags))
+   return;
+
+   if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
+   (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
+   wbt_init(q);
+}
+EXPORT_SYMBOL_GPL(wbt_enable_default);
+
 u64 wbt_default_latency_nsec(struct request_queue *q)
 {
/*
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index ad6c78507c3a..df6de50c5d59 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
 void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
 void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
 void wbt_disable_default(struct request_queue *);
+void wbt_enable_default(struct request_queue *);
 
 void wbt_set_queue_depth(struct rq_wb *, unsigned int);
 void wbt_set_write_cache(struct rq_wb *, bool);
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct 
blk_issue_stat *stat)
 static inline void wbt_disable_default(struct request_queue *q)
 {
 }
+static inline void wbt_enable_default(struct request_queue *q)
+{
+}
 static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
 {
 }
diff --git a/block/elevator.c b/block/elevator.c
index dbeecf7be719..fb50416b5aae 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -41,6 +41,7 @@
 
 #include "blk.h"
 #include "blk-mq-sched.h"
+#include "blk-wbt.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -877,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
kobject_uevent(>kobj, KOBJ_REMOVE);
kobject_del(>kobj);
e->registered = 0;
+   /* Re-enable throttling in case elevator disabled it */
+   wbt_enable_default(q);
}
 }
 EXPORT_SYMBOL(elv_unregister_queue);
-- 
2.12.0



Re: [PATCH V4 00/16] Introduce the BFQ I/O scheduler

2017-04-19 Thread Paolo Valente

> Il giorno 12 apr 2017, alle ore 18:23, Paolo Valente 
>  ha scritto:
> 
> Hi,
> new patch series, addressing (both) issues raised by Bart [1], and
> with block/Makefile fixed as suggested by Bart [2].
> 

Hi Jens,
apparently no complain of any sort on this last series.  Do you think
we could make it for 4.12, or shall we aim at 4.13?

Thanks,
Paolo

> Thanks,
> Paolo
> 
> [1] https://lkml.org/lkml/2017/3/31/393
> [2] https://lkml.org/lkml/2017/4/12/502
> 
> Arianna Avanzini (4):
>  block, bfq: add full hierarchical scheduling and cgroups support
>  block, bfq: add Early Queue Merge (EQM)
>  block, bfq: reduce idling only in symmetric scenarios
>  block, bfq: handle bursts of queue activations
> 
> Paolo Valente (12):
>  block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler
>  block, bfq: improve throughput boosting
>  block, bfq: modify the peak-rate estimator
>  block, bfq: add more fairness with writes and slow processes
>  block, bfq: improve responsiveness
>  block, bfq: reduce I/O latency for soft real-time applications
>  block, bfq: preserve a low latency also with NCQ-capable drives
>  block, bfq: reduce latency during request-pool saturation
>  block, bfq: boost the throughput on NCQ-capable flash-based devices
>  block, bfq: boost the throughput with random I/O on NCQ-capable HDDs
>  block, bfq: remove all get and put of I/O contexts
>  block, bfq: split bfq-iosched.c into multiple source files
> 
> Documentation/block/00-INDEX|2 +
> Documentation/block/bfq-iosched.txt |  531 
> block/Kconfig.iosched   |   21 +
> block/Makefile  |2 +
> block/bfq-cgroup.c  | 1139 
> block/bfq-iosched.c | 5047 +++
> block/bfq-iosched.h |  942 +++
> block/bfq-wf2q.c| 1616 +++
> include/linux/blkdev.h  |2 +-
> 9 files changed, 9301 insertions(+), 1 deletion(-)
> create mode 100644 Documentation/block/bfq-iosched.txt
> create mode 100644 block/bfq-cgroup.c
> create mode 100644 block/bfq-iosched.c
> create mode 100644 block/bfq-iosched.h
> create mode 100644 block/bfq-wf2q.c
> 
> --
> 2.10.0



Re: [PATCH 0/3] mtip32xx: make it working with mq scheduler

2017-04-19 Thread Ming Lei
On Tue, Apr 18, 2017 at 11:27:13PM -0700, Christoph Hellwig wrote:
> Hi Ming,
> 
> I don't think your patch goes int the right direction.  The mtip32xx
> driver never submits the requests it allocates using blk_mq_alloc_request.
> So to fix this it should simply kmalloc the request, set a tag aside
> and not use a block layer request for it at all, similar to what nvme
> does for the AER command.

That is only about the internal command part(patch 1 and patch 2),
and the patch 3 is required regardless.

I choose to use blk_mq_get_driver_tag() for the 1st fix because it is the
simplest way, also given blk_mq_alloc_request() has been working on that
for long time.

If we do that via kmalloc, the command has to be initialized as the way in
.init_cmd(), the reserved tag space has to be accessed exclusively like what
block layer provides, and mtip_cmd_from_tag() need to be fixed too.
We still have to pass 'reserved_tags = 1' to blk-mq anyway.

That is why I suggest to use blk_mq_get_driver_tag() to fix the issue.

Thanks,
Ming


Re: [PATCH 1/4] block: remove the osdblk driver

2017-04-19 Thread Boaz Harrosh
On 04/12/2017 07:01 PM, Christoph Hellwig wrote:
> This was just a proof of concept user for the SCSI OSD library, and
> never had any real users.
> 
> Signed-off-by: Christoph Hellwig 

Yes please remove this driver

ACK-by Boaz Harrosh 

> ---
>  drivers/block/Kconfig  |  16 --
>  drivers/block/Makefile |   1 -
>  drivers/block/osdblk.c | 693 
> -
>  3 files changed, 710 deletions(-)
>  delete mode 100644 drivers/block/osdblk.c
> 
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index a1c2e816128f..58e24c354933 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -312,22 +312,6 @@ config BLK_DEV_SKD
>  
>   Use device /dev/skd$N amd /dev/skd$Np$M.
>  
> -config BLK_DEV_OSD
> - tristate "OSD object-as-blkdev support"
> - depends on SCSI_OSD_ULD
> - ---help---
> -   Saying Y or M here will allow the exporting of a single SCSI
> -   OSD (object-based storage) object as a Linux block device.
> -
> -   For example, if you create a 2G object on an OSD device,
> -   you can then use this module to present that 2G object as
> -   a Linux block device.
> -
> -   To compile this driver as a module, choose M here: the
> -   module will be called osdblk.
> -
> -   If unsure, say N.
> -
>  config BLK_DEV_SX8
>   tristate "Promise SATA SX8 support"
>   depends on PCI
> diff --git a/drivers/block/Makefile b/drivers/block/Makefile
> index b12c772bbeb3..42e8cee1cbc7 100644
> --- a/drivers/block/Makefile
> +++ b/drivers/block/Makefile
> @@ -22,7 +22,6 @@ obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
>  obj-$(CONFIG_MG_DISK)+= mg_disk.o
>  obj-$(CONFIG_SUNVDC) += sunvdc.o
>  obj-$(CONFIG_BLK_DEV_SKD)+= skd.o
> -obj-$(CONFIG_BLK_DEV_OSD)+= osdblk.o
>  
>  obj-$(CONFIG_BLK_DEV_UMEM)   += umem.o
>  obj-$(CONFIG_BLK_DEV_NBD)+= nbd.o
> diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
> deleted file mode 100644
> index 8127b8201a01..
> --- a/drivers/block/osdblk.c
> +++ /dev/null
> @@ -1,693 +0,0 @@
> -
> -/*
> -   osdblk.c -- Export a single SCSI OSD object as a Linux block device
> -
> -
> -   Copyright 2009 Red Hat, Inc.
> -
> -   This program is free software; you can redistribute it and/or modify
> -   it under the terms of the GNU General Public License as published by
> -   the Free Software Foundation.
> -
> -   This program is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> -   GNU General Public License for more details.
> -
> -   You should have received a copy of the GNU General Public License
> -   along with this program; see the file COPYING.  If not, write to
> -   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
> -
> -
> -   Instructions for use
> -   
> -
> -   1) Map a Linux block device to an existing OSD object.
> -
> -  In this example, we will use partition id 1234, object id 5678,
> -  OSD device /dev/osd1.
> -
> -  $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
> -
> -
> -   2) List all active blkdev<->object mappings.
> -
> -  In this example, we have performed step #1 twice, creating two blkdevs,
> -  mapped to two separate OSD objects.
> -
> -  $ cat /sys/class/osdblk/list
> -  0 174 1234 5678 /dev/osd1
> -  1 179 1994 897123 /dev/osd0
> -
> -  The columns, in order, are:
> -  - blkdev unique id
> -  - blkdev assigned major
> -  - OSD object partition id
> -  - OSD object id
> -  - OSD device
> -
> -
> -   3) Remove an active blkdev<->object mapping.
> -
> -  In this example, we remove the mapping with blkdev unique id 1.
> -
> -  $ echo 1 > /sys/class/osdblk/remove
> -
> -
> -   NOTE:  The actual creation and deletion of OSD objects is outside the 
> scope
> -   of this driver.
> -
> - */
> -
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -
> -#define DRV_NAME "osdblk"
> -#define PFX DRV_NAME ": "
> -
> -/* #define _OSDBLK_DEBUG */
> -#ifdef _OSDBLK_DEBUG
> -#define OSDBLK_DEBUG(fmt, a...) \
> - printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
> -#else
> -#define OSDBLK_DEBUG(fmt, a...) \
> - do { if (0) printk(fmt, ##a); } while (0)
> -#endif
> -
> -MODULE_AUTHOR("Jeff Garzik ");
> -MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
> -MODULE_LICENSE("GPL");
> -
> -struct osdblk_device;
> -
> -enum {
> - OSDBLK_MINORS_PER_MAJOR = 256,  /* max minors per blkdev */
> - OSDBLK_MAX_REQ  = 32,   /* max parallel requests */
> - OSDBLK_OP_TIMEOUT   = 4 * 60,   /* sync OSD req timeout */
> -};
> -
> -struct osdblk_request {
> - struct request  *rq;   

Re: [PATCH 4/8] nowait-aio: Introduce IOMAP_NOWAIT

2017-04-19 Thread Jan Kara
On Fri 14-04-17 07:02:53, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps.
> This is used by XFS in the XFS patch.

Goldwyn, the patch is missing your Signed-off-by...

Honza

> ---
>  fs/iomap.c| 2 ++
>  include/linux/iomap.h | 1 +
>  2 files changed, 3 insertions(+)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 141c3cd55a8b..d1c81753d411 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -885,6 +885,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>   } else {
>   dio->flags |= IOMAP_DIO_WRITE;
>   flags |= IOMAP_WRITE;
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + flags |= IOMAP_NOWAIT;
>   }
>  
>   if (mapping->nrpages) {
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 7291810067eb..53f6af89c625 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -51,6 +51,7 @@ struct iomap {
>  #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */
>  #define IOMAP_FAULT  (1 << 3) /* mapping for page fault */
>  #define IOMAP_DIRECT (1 << 4) /* direct I/O */
> +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */
>  
>  struct iomap_ops {
>   /*
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH V3 02/16] block, bfq: add full hierarchical scheduling and cgroups support

2017-04-19 Thread Paolo Valente

> Il giorno 19 apr 2017, alle ore 07:33, Paolo Valente 
>  ha scritto:
> 
>> 
>> Il giorno 18 apr 2017, alle ore 09:04, Tejun Heo  ha 
>> scritto:
>> 
>> Hello, Paolo.
>> 
>> On Wed, Apr 12, 2017 at 07:22:03AM +0200, Paolo Valente wrote:
>>> could you elaborate a bit more on this?  I mean, cgroups support has
>>> been in BFQ (and CFQ) for almost ten years, perfectly working as far
>>> as I know.  Of course it is perfectly working in terms of I/O and not
>>> of CPU bandwidth distribution; and, for the moment, it is effective
>>> only for devices below 30-50KIOPS.  What's the point in throwing
>>> (momentarily?) away such a fundamental feature?  What am I missing?
>> 
>> I've been trying to track down latency issues with the CPU controller
>> which basically takes the same approach and I'm not sure nesting
>> scheduler timelines is a good approach.  It intuitively feels elegant
>> but seems to have some fundamental issues.  IIUC, bfq isn't quite the
>> same in that it doesn't need load balancer across multiple queues and
>> it could be that bfq is close enough to the basic model that the
>> nested behavior maps to the correct scheduling behavior.
>> 
>> However, for example, in the CPU controller, the nested timelines
>> break sleeper boost.  The boost is implemented by considering the
>> thread to have woken up upto some duration prior to the current time;
>> however, it only affects the timeline inside the cgroup and there's no
>> good way to propagate it upwards.  The final result is two threads in
>> a cgroup with the double weight can behave significantly worse in
>> terms of latency compared to two threads with the weight of 1 in the
>> root.
>> 
> 
> Hi Tejun,
> I don't know in detail the specific multiple-queue issues you report,
> but bfq implements the upward propagation you mention: if a process in
> a group is to be privileged, i.e., if the process has basically to be
> provided with a higher weight (in addition to other important forms of
> help), then this weight boost is propagated upward through the path
> from the process to the root node in the group hierarchy.
> 

ERRATA CORRIGE: actually, this propagation is implemented in a simple
variant of bfq that I made for a virtualization company (to truly
guarantee a low latency to the processes in a guest OS, regardless of
the load in the host).  The base version of bfq in these patches
contains all the mechanisms needed to get this propagation, but
doesn't modify group weights autonomously.

Paolo

>> Given that the nested scheduling ends up pretty expensive, I'm not
>> sure how good a model this nesting approach is.  Especially if there
>> can be multiple queues, the weight distribution across cgroup
>> instances across multiple queues has to be coordinated globally
>> anyway,
> 
> To get perfect global service guarantees, yes.  But you can settle
> with tradeoffs that, according to my experience with storage and
> packet I/O, are so good to be probably indistinguishable from an
> ideal, but too costly solution.  I mean, with a well-done approximated
> scheduling solution, the deviation with respect to an ideal service
> can be in the same order of the noise caused by unavoidable latencies
> of other sw and hw components than the scheduler.
> 
>> so the weight / cost adjustment part can't happen
>> automatically anyway as in single queue case.  If we're going there,
>> we might as well implement cgroup support by actively modulating the
>> combined weights, which will make individual scheduling operations
>> cheaper and it easier to think about and guarantee latency behaviors.
>> 
> 
> Yes.  Anyway, I didn't quite understand what is or could be the
> alternative, w.r.t. hierarchical scheduling, for guaranteeing
> bandwidth distribution of shared resources in a complex setting.  If
> you think I could be of any help on this, just put me somehow in the
> loop.
> 
>> If you think that bfq will stay single queue and won't need timeline
>> modifying heuristics (for responsiveness or whatever), the current
>> approach could be fine, but I'm a bit awry about committing to the
>> current approach if we're gonna encounter the same problems.
>> 
> 
> As of now, bfq is targeted at not too fast devices (< 30-50KIOPS),
> which happen to be single queue.  In particular, bfq is currently
> agnostic w.r.t. the number of downstream queues.
> 
> Thanks,
> Paolo
> 
>> Thanks.
>> 
>> -- 
>> tejun



Re: bfq-mq performance comparison to cfq

2017-04-19 Thread Paolo Valente

> Il giorno 19 apr 2017, alle ore 07:01, Bart Van Assche 
>  ha scritto:
> 
> On 04/11/17 00:29, Paolo Valente wrote:
>> 
>>> Il giorno 10 apr 2017, alle ore 17:15, Bart Van Assche 
>>>  ha scritto:
>>> 
>>> On Mon, 2017-04-10 at 11:55 +0200, Paolo Valente wrote:
 That said, if you do always want maximum throughput, even at the
 expense of latency, then just switch off low-latency heuristics, i.e.,
 set low_latency to 0.  Depending on the device, setting slice_ilde to
 0 may help a lot too (as well as with CFQ).  If the throughput is
 still low also after forcing BFQ to an only-throughput mode, then you
 hit some bug, and I'll have a little more work to do ...
>>> 
>>> Has it been considered to make applications tell the I/O scheduler
>>> whether to optimize for latency or for throughput? It shouldn't be that
>>> hard for window managers and shells to figure out whether or not a new
>>> application that is being started is interactive or not. This would
>>> require a mechanism that allows applications to provide such information
>>> to the I/O scheduler. Wouldn't that be a better approach than the I/O
>>> scheduler trying to guess whether or not an application is an interactive
>>> application?
>> 
>> IMO that would be an (or maybe the) optimal solution, in terms of both
>> throughput and latency.  We have even developed a prototype doing what
>> you propose, for Android.  Unfortunately, I have not yet succeeded in
>> getting support, to turn it into candidate production code, or to make
>> a similar solution for lsb-compliant systems.
> 
> Hello Paolo,
> 
> What API was used by the Android application to tell the I/O scheduler 
> to optimize for latency? Do you think that it would be sufficient if the 
> application uses the ioprio_set() system call to set the I/O priority to 
> IOPRIO_CLASS_RT?
> 

That's exactly the hack we are using in our prototype.  However, it
can only be a temporary hack, because it mixes two slightly different
concepts: 1) the activation of weight raising and other mechanisms for
reducing latency for the target app, 2) the assignment of a different
priority class, which (cleanly) means just that processes in a lower
priority class will be served only when the processes of the target
app have no pending I/O request.  Finding a clean boosting API would
be one of the main steps to turn our prototype into a usable solution.

Thanks,
Paolo

> Thanks,
> 
> Bart.



Re: [PATCH 7/8] nowait aio: xfs

2017-04-19 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 5/8] nowait aio: return on congested block device

2017-04-19 Thread Christoph Hellwig
On Fri, Apr 14, 2017 at 07:02:54AM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> A new bio operation flag REQ_NOWAIT is introduced to identify bio's

s/bio/block/

> @@ -1232,6 +1232,11 @@ static struct request *get_request(struct 
> request_queue *q, unsigned int op,
>   if (!IS_ERR(rq))
>   return rq;
>  
> + if (bio && (bio->bi_opf & REQ_NOWAIT)) {
> + blk_put_rl(rl);
> + return ERR_PTR(-EAGAIN);
> + }

Please check the op argument instead of touching bio.

> + if (bio->bi_opf & REQ_NOWAIT) {
> + if (!blk_queue_nowait(q)) {
> + err = -EOPNOTSUPP;
> + goto end_io;
> + }
> + if (!(bio->bi_opf & REQ_SYNC)) {

I don't understand this check at all..

> + if (unlikely(!blk_queue_dying(q) && (bio->bi_opf & 
> REQ_NOWAIT)))

Please break lines after 80 characters.

> @@ -119,6 +119,9 @@ struct request *blk_mq_sched_get_request(struct 
> request_queue *q,
>   if (likely(!data->hctx))
>   data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
>  
> + if (bio && (bio->bi_opf & REQ_NOWAIT))
> + data->flags |= BLK_MQ_REQ_NOWAIT;

Check the op flag again here.

> +++ b/block/blk-mq.c
> @@ -1538,6 +1538,8 @@ static blk_qc_t blk_mq_make_request(struct 
> request_queue *q, struct bio *bio)
>   rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
>   if (unlikely(!rq)) {
>   __wbt_done(q->rq_wb, wb_acct);
> + if (bio && (bio->bi_opf & REQ_NOWAIT))
> + bio_wouldblock_error(bio);

bio is dereferenced unconditionally above, so you can do the same.

> @@ -1662,6 +1664,8 @@ static blk_qc_t blk_sq_make_request(struct 
> request_queue *q, struct bio *bio)
>   rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
>   if (unlikely(!rq)) {
>   __wbt_done(q->rq_wb, wb_acct);
> + if (bio && (bio->bi_opf & REQ_NOWAIT))
> + bio_wouldblock_error(bio);

Same here.  Although blk_sq_make_request is gone anyway in the current
block tree..

> + /* Request queue supports BIO_NOWAIT */
> + queue_flag_set_unlocked(QUEUE_FLAG_NOWAIT, q);

BIO_NOWAIT is gone.  And the comment would not be needed if the
flag had a more descriptive name, e.g. QUEUE_FLAG_NOWAIT_SUPPORT.

And I think all request based drivers should set the flag implicitly
as ->queuecommand can't sleep, and ->queue_rq only when it's always
offloaded to a workqueue when the BLK_MQ_F_BLOCKING flag is set.

> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -480,8 +480,12 @@ static int dio_bio_complete(struct dio *dio, struct bio 
> *bio)
>   unsigned i;
>   int err;
>  
> - if (bio->bi_error)
> - dio->io_error = -EIO;
> + if (bio->bi_error) {
> + if (bio->bi_opf & REQ_NOWAIT)
> + dio->io_error = -EAGAIN;
> + else
> + dio->io_error = -EIO;
> + }

Huh?  Once REQ_NOWAIT is set all errors are -EAGAIN?


  1   2   >