[PATCH V2 6/7] blk-mq: don't allocate driver tag beforehand for flush rq

2017-09-15 Thread Ming Lei
The idea behind this is simple:

1) for the none scheduler, the driver tag has to be borrowed for the
flush rq, otherwise we may run out of tags and cause an I/O hang.
Since get/put of the driver tag is actually a nop there, reordering
tags isn't necessary at all.

2) for a real I/O scheduler, we don't need to allocate the driver tag
beforehand for the flush rq, and it works just fine to follow the
way used for normal requests: allocate the driver tag for each rq
just before calling .queue_rq().

One visible change to drivers is that the driver tag isn't shared
across the flush request sequence; that won't be a problem, since
we always do it this way in the legacy path.

Then the flush rq isn't treated specially wrt. get/put of the driver
tag, and the code gets cleaned up a lot: for example,
reorder_tags_to_front() is removed, and we no longer need to worry
about request order in the dispatch list to avoid I/O deadlock.
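For illustration, the tag handling this introduces in blk_kick_flush() boils
down to the following (condensed from the diff below, not additional code;
with a real scheduler the driver tag is then acquired at dispatch time, just
like for any other request):

	if (!q->elevator) {
		/* none: borrow the data rq's driver tag, as before */
		fq->orig_rq = first_rq;
		flush_rq->tag = first_rq->tag;
		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
	} else {
		/* real scheduler: borrow only the scheduler (internal) tag */
		flush_rq->internal_tag = first_rq->internal_tag;
	}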

Signed-off-by: Ming Lei 
---
 block/blk-flush.c| 41 +++--
 block/blk-mq-sched.c | 36 ++--
 block/blk-mq.c   | 44 ++--
 3 files changed, 47 insertions(+), 74 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index a9773d2075ac..65914a23fa77 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -216,6 +216,17 @@ static bool blk_flush_complete_seq(struct request *rq,
return kicked | queued;
 }
 
+/*
+ * We don't share tag between flush rq and data rq in case of
+ * IO scheduler, so have to release tag and restart queue
+ */
+static void put_flush_driver_tag(struct blk_mq_hw_ctx *hctx,
+struct request *rq)
+{
+   blk_mq_put_driver_tag_hctx(hctx, rq);
+   blk_mq_sched_restart(hctx);
+}
+
 static void flush_end_io(struct request *flush_rq, blk_status_t error)
 {
struct request_queue *q = flush_rq->q;
@@ -231,8 +242,13 @@ static void flush_end_io(struct request *flush_rq, 
blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
-   blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
-   flush_rq->tag = -1;
+   if (!q->elevator) {
+   blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+   flush_rq->tag = -1;
+   } else {
+   put_flush_driver_tag(hctx, flush_rq);
+   flush_rq->internal_tag = -1;
+   }
}
 
running = &fq->flush_queue[fq->flush_running_idx];
@@ -321,16 +337,24 @@ static bool blk_kick_flush(struct request_queue *q, 
struct blk_flush_queue *fq)
 * Borrow tag from the first request since they can't
 * be in flight at the same time. And acquire the tag's
 * ownership for flush req.
+*
+* In case of io scheduler, flush rq need to borrow
+* scheduler tag for making put/get driver tag workable.
+* In case of none, flush rq need to borrow driver tag.
 */
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
 
flush_rq->mq_ctx = first_rq->mq_ctx;
-   flush_rq->tag = first_rq->tag;
-   fq->orig_rq = first_rq;
 
-   hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
-   blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+   if (!q->elevator) {
+   fq->orig_rq = first_rq;
+   flush_rq->tag = first_rq->tag;
+   hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
+   blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+   } else {
+   flush_rq->internal_tag = first_rq->internal_tag;
+   }
}
 
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +418,11 @@ static void mq_flush_data_end_io(struct request *rq, 
blk_status_t error)
 
hctx = blk_mq_map_queue(q, ctx->cpu);
 
+   if (q->elevator) {
+   WARN_ON(rq->tag < 0);
+   put_flush_driver_tag(hctx, rq);
+   }
+
/*
 * After populating an empty queue, kick it to avoid stall.  Read
 * the comment in flush_end_io().
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index fa0f33d7efb9..c2ea8c2b896b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -341,21 +341,6 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
}
 }
 
-/*
- * Add flush/fua to the queue. If we fail getting a driver tag, then
- * punt to the requeue list. Requeue will re-invoke us from a context
- * that's safe to block from.
- */
-static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
- struct request *rq, bool can_block)
-{
-   if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
-   blk_insert_flush(rq);
-   

[PATCH V2 7/7] blk-mq-sched: warning on inserting a req with driver tag allocated

2017-09-15 Thread Ming Lei
When an I/O scheduler is in use, no request should have a driver tag
assigned before dispatching, so add a warning to catch possible bugs.

Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c2ea8c2b896b..b6934a9424ff 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -271,10 +271,8 @@ static bool blk_mq_sched_bypass_insert(struct 
blk_mq_hw_ctx *hctx,
return true;
}
 
-   if (has_sched) {
+   if (has_sched)
rq->rq_flags |= RQF_SORTED;
-   WARN_ON(rq->tag != -1);
-   }
 
return false;
 }
@@ -355,6 +353,8 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
goto run;
}
 
+   WARN_ON(e && (rq->tag != -1));
+
if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run;
 
-- 
2.9.5



[PATCH V2 5/7] blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h

2017-09-15 Thread Ming Lei
We need this helper to put the driver tag of the flush rq, since we
will no longer share the tag across the flush request sequence when
an I/O scheduler is in use.

Also, the driver tag needs to be released before requeuing.

Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 32 
 block/blk-mq.h | 33 +
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 21331e785919..b799d4e22c0b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -920,38 +920,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct 
blk_mq_hw_ctx **hctx,
return rq->tag != -1;
 }
 
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-   struct request *rq)
-{
-   blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
-   rq->tag = -1;
-
-   if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-   rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-   atomic_dec(&hctx->nr_active);
-   }
-}
-
-static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
-  struct request *rq)
-{
-   if (rq->tag == -1 || rq->internal_tag == -1)
-   return;
-
-   __blk_mq_put_driver_tag(hctx, rq);
-}
-
-static void blk_mq_put_driver_tag(struct request *rq)
-{
-   struct blk_mq_hw_ctx *hctx;
-
-   if (rq->tag == -1 || rq->internal_tag == -1)
-   return;
-
-   hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
-   __blk_mq_put_driver_tag(hctx, rq);
-}
-
 /*
  * If we fail getting a driver tag because all the driver tags are already
  * assigned and on the dispatch list, BUT the first entry does not have a
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 038bdeb84629..2bef2d0cca7b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -2,6 +2,7 @@
 #define INT_BLK_MQ_H
 
 #include "blk-stat.h"
+#include "blk-mq-tag.h"
 
 struct blk_mq_tag_set;
 
@@ -137,4 +138,36 @@ static inline bool blk_mq_hw_queue_mapped(struct 
blk_mq_hw_ctx *hctx)
 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2]);
 
+static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+  struct request *rq)
+{
+   blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+   rq->tag = -1;
+
+   if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+   rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+   atomic_dec(&hctx->nr_active);
+   }
+}
+
+static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+  struct request *rq)
+{
+   if (rq->tag == -1 || rq->internal_tag == -1)
+   return;
+
+   __blk_mq_put_driver_tag(hctx, rq);
+}
+
+static inline void blk_mq_put_driver_tag(struct request *rq)
+{
+   struct blk_mq_hw_ctx *hctx;
+
+   if (rq->tag == -1 || rq->internal_tag == -1)
+   return;
+
+   hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+   __blk_mq_put_driver_tag(hctx, rq);
+}
+
 #endif
-- 
2.9.5



[PATCH V2 4/7] blk-mq: decide how to handle flush rq via RQF_FLUSH_SEQ

2017-09-15 Thread Ming Lei
Now we always preallocate one driver tag before blk_insert_flush(),
and the flush request will be marked as RQF_FLUSH_SEQ once it is in
the flush machinery.

So if RQF_FLUSH_SEQ isn't set, we call blk_insert_flush() to handle
the request; otherwise the flush request is dispatched to the
->dispatch list directly.

This is a preparation patch for not preallocating a driver tag for
the flush rq, and it removes the special case.

Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..fa0f33d7efb9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -260,21 +260,23 @@ void blk_mq_sched_request_inserted(struct request *rq)
 EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
 
 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
+  bool has_sched,
   struct request *rq)
 {
-   if (rq->tag == -1) {
+   /* dispatch flush rq directly */
+   if (rq->rq_flags & RQF_FLUSH_SEQ) {
+   spin_lock(&hctx->lock);
+   list_add(&rq->queuelist, &hctx->dispatch);
+   spin_unlock(&hctx->lock);
+   return true;
+   }
+
+   if (has_sched) {
rq->rq_flags |= RQF_SORTED;
-   return false;
+   WARN_ON(rq->tag != -1);
}
 
-   /*
-* If we already have a real request tag, send directly to
-* the dispatch list.
-*/
-   spin_lock(&hctx->lock);
-   list_add(&rq->queuelist, &hctx->dispatch);
-   spin_unlock(&hctx->lock);
-   return true;
+   return false;
 }
 
 /**
@@ -362,12 +364,13 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
-   if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
+   /* flush rq in flush machinery need to be dispatched directly */
+   if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
blk_mq_sched_insert_flush(hctx, rq, can_block);
return;
}
 
-   if (e && blk_mq_sched_bypass_insert(hctx, rq))
+   if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run;
 
if (e && e->type->ops.mq.insert_requests) {
@@ -405,7 +408,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
list_for_each_entry_safe(rq, next, list, queuelist) {
if (WARN_ON_ONCE(rq->tag != -1)) {
-   list_del_init(&rq->queuelist);
-   blk_mq_sched_bypass_insert(hctx, rq);
+   blk_mq_sched_bypass_insert(hctx, true, rq);
}
}
}
-- 
2.9.5



[PATCH V2 2/7] block: pass 'run_queue' to blk_mq_request_bypass_insert

2017-09-15 Thread Ming Lei
The block flush code needs this function without running the queue,
so introduce the parameter.

Signed-off-by: Ming Lei 
---
 block/blk-core.c | 2 +-
 block/blk-mq.c   | 5 +++--
 block/blk-mq.h   | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index aebe676225e6..bb9ec0013582 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2347,7 +2347,7 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   blk_mq_request_bypass_insert(rq);
+   blk_mq_request_bypass_insert(rq, true);
return BLK_STS_OK;
}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a18609755e..21331e785919 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1405,7 +1405,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 
struct request *rq,
  * Should only be used carefully, when the caller knows we want to
  * bypass a potential IO scheduler on the target device.
  */
-void blk_mq_request_bypass_insert(struct request *rq)
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
 {
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
@@ -1414,7 +1414,8 @@ void blk_mq_request_bypass_insert(struct request *rq)
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
 
-   blk_mq_run_hw_queue(hctx, false);
+   if (run_queue)
+   blk_mq_run_hw_queue(hctx, false);
 }
 
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ef15b3414da5..038bdeb84629 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -54,7 +54,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct 
blk_mq_tags *tags,
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
-void blk_mq_request_bypass_insert(struct request *rq);
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
 
-- 
2.9.5



[PATCH V2 3/7] blk-flush: use blk_mq_request_bypass_insert()

2017-09-15 Thread Ming Lei
In the following patch, we will use RQF_FLUSH_SEQ
to decide:

- if the flag isn't set, the flush rq needs to be inserted
via blk_insert_flush()
- otherwise, the flush rq needs to be dispatched directly,
since it is in the flush machinery now.

So we use blk_mq_request_bypass_insert() for requests
bypassing the flush machinery, like what the legacy path does.

Signed-off-by: Ming Lei 
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 81bd1a843043..a9773d2075ac 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops)
-   blk_mq_sched_insert_request(rq, false, false, false, 
false);
+   blk_mq_request_bypass_insert(rq, false);
else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
-- 
2.9.5



[PATCH V2 0/7] blk-mq: don't allocate driver tag beforehand for flush rq

2017-09-15 Thread Ming Lei
Hi,

This patchset avoids allocating a driver tag beforehand for the flush rq
when an I/O scheduler is in use; then the flush rq isn't treated specially
wrt. get/put of the driver tag, the code gets cleaned up a lot (for example,
reorder_tags_to_front() is removed), and we no longer need to worry
about request order in the dispatch list to avoid I/O deadlock.

'dbench -t 30 -s -F 64' has been run on different devices (shared tag,
multi-queue, single queue, ...), and no issues are observed; even with a
very low queue depth (1), dbench still works well.

Any comments are welcome!

V2:
- release driver tag before requeue
- warning on inserting a req with driver tag


Ming Lei (7):
  blk-flush: don't run queue for requests of bypassing flush
  block: pass 'run_queue' to blk_mq_request_bypass_insert
  blk-flush: use blk_mq_request_bypass_insert()
  blk-mq: decide how to handle flush rq via RQF_FLUSH_SEQ
  blk-mq: move blk_mq_put_driver_tag*() into blk-mq.h
  blk-mq: don't allocate driver tag beforehand for flush rq
  blk-mq-sched: warning on inserting a req with driver tag allocated

 block/blk-core.c |  2 +-
 block/blk-flush.c| 43 +++-
 block/blk-mq-sched.c | 65 -
 block/blk-mq.c   | 81 +---
 block/blk-mq.h   | 35 ++-
 5 files changed, 102 insertions(+), 124 deletions(-)

-- 
2.9.5



[PATCH V2 1/7] blk-flush: don't run queue for requests of bypassing flush

2017-09-15 Thread Ming Lei
blk_insert_flush() should only insert the request, since a queue run
always follows it.

For the case of bypassing the flush machinery, we don't need to run the
queue here, since every blk_insert_flush() is already followed by a
queue run.

Signed-off-by: Ming Lei 
---
 block/blk-flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 4938bec8cfef..81bd1a843043 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -463,7 +463,7 @@ void blk_insert_flush(struct request *rq)
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops)
-   blk_mq_sched_insert_request(rq, false, true, false, 
false);
+   blk_mq_sched_insert_request(rq, false, false, false, 
false);
else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
-- 
2.9.5



Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 07:35 +0900, Damien Le Moal wrote:
> rw16 is mandatory for ZBC drives. So it has to be set to true. If the
> HBA does not support rw16 (why would that happen ?), then the disk
> should not be used.

It's good that all HBAs support rw16. But it's nontrivial to analyze whether
use_1[06]_for_rw are set before they are used, so maybe we should think about
how to restructure the SCSI code such that it becomes easier to verify that
use_1[06]_for_rw are set before they are used.

Bart.

Re: [PATCH 0/6] blk-mq: don't allocate driver tag beforehand for flush rq

2017-09-15 Thread Ming Lei
On Fri, Sep 15, 2017 at 12:06:41PM -0600, Jens Axboe wrote:
> On 09/15/2017 08:29 AM, Jens Axboe wrote:
> > On 09/14/2017 08:20 PM, Ming Lei wrote:
> >> On Thu, Sep 14, 2017 at 12:51:24PM -0600, Jens Axboe wrote:
> >>> On 09/14/2017 10:42 AM, Ming Lei wrote:
>  Hi,
> 
>  This patchset avoids to allocate driver tag beforehand for flush rq
>  in case of I/O scheduler, then flush rq isn't treated specially
>  wrt. get/put driver tag, code gets cleanup much, such as,
>  reorder_tags_to_front() is removed, and we needn't to worry
>  about request order in dispatch list for avoiding I/O deadlock.
> 
>  'dbench -t 30 -s -F 64' has been run on different devices(shared tag,
>  multi-queue, singele queue, ...), and no issues are observed,
>  even very low queue depth(1) test are run, debench still works
>  well.
> >>>
> >>> Gave this a quick spin on the test box, and I get tons of spewage
> >>> on booting up:
> >>>
> >>> [9.131290] WARNING: CPU: 2 PID: 337 at block/blk-mq-sched.c:274 
> >>> blk_mq_sched_insert_request+0x15d/0x170
> >>
> >> Sorry, my fault.
> >>
> >> The WARN_ON() was inside 'if (has_sched)' actually, and could you
> >> please remove the WARN_ON() in blk_mq_sched_bypass_insert() and
> >> see if it works?
> > 
> > Putting it inside 'has_sched' makes it go away. I'll fire up some
> > testing of it here.
> 
> It still triggers for the requeue path, however:
> 
> [12403.280753] WARNING: CPU: 4 PID: 2279 at block/blk-mq-sched.c:275 
> blk_mq_sched_insert_request+0
> [12403.302465] Modules linked in: null_blk configfs crct10dif_generic 
> crct10dif_common loop dm_mo]
> [12403.331762] CPU: 4 PID: 2279 Comm: kworker/4:1H Tainted: GW   
> 4.13.0+ #473
> [12403.341497] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
> 11/09/2016
> [12403.350745] Workqueue: kblockd blk_mq_requeue_work
> [12403.356574] task: 881ff6c57000 task.stack: 881ff2acc000
> [12403.363667] RIP: 0010:blk_mq_sched_insert_request+0x15d/0x170
> [12403.370565] RSP: 0018:881ff2acfdc0 EFLAGS: 00010213
> [12403.376878] RAX: 881ff2868000 RBX: 880c7677be00 RCX: 
> 00022003
> [12403.385333] RDX: 881ff32d8c00 RSI: 0001 RDI: 
> 880c7677be00
> [12403.393790] RBP: 881ff2acfe08 R08: 0001 R09: 
> 8809010d4000
> [12403.402240] R10:  R11: 1000 R12: 
> 881ff37b3800
> [12403.410695] R13:  R14:  R15: 
> e8dfffe86940
> [12403.419150] FS:  () GS:881fff68() 
> knlGS:
> [12403.429065] CS:  0010 DS:  ES:  CR0: 80050033
> [12403.435961] CR2: 7f13f4ce1000 CR3: 001ff357d002 CR4: 
> 003606e0
> [12403.14] DR0:  DR1:  DR2: 
> 
> [12403.452868] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [12403.461316] Call Trace:
> [12403.464528]  ? __blk_mq_delay_run_hw_queue+0x84/0xa0
> [12403.470550]  blk_mq_requeue_work+0xc6/0x140
> [12403.475702]  process_one_work+0x18a/0x3e0
> [12403.480654]  worker_thread+0x48/0x3b0
> [12403.485313]  kthread+0x12a/0x140
> [12403.489390]  ? process_one_work+0x3e0/0x3e0
> [12403.494545]  ? kthread_create_on_node+0x40/0x40
> [12403.500078]  ret_from_fork+0x22/0x30
> [12403.504551] Code: c0 4c 89 fa e8 e5 97 02 00 48 8b 4d c0 84 c0 74 10 49 89 
> 5f 08 4c 89 3b 48 8 
> [12403.526954] ---[ end trace 2eefb804292867b5 ]---
> 
> The code now looks like this:
> 
> if (has_sched) {
>   WARN_ON(rq->tag != -1); 
>   rq->rq_flags |= RQF_SORTED; 
> }

Yeah, requeue is one case I missed, since the request may
have a valid driver tag assigned at that time if the requeue
happens in the completion path.

So I think we need to release the driver tag in __blk_mq_requeue_request().
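A minimal sketch of that idea (placement is hypothetical; the helper name
comes from patch 5/7 of this series, and the call is a no-op when no driver
tag is held):

	static void __blk_mq_requeue_request(struct request *rq)
	{
		/* drop the driver tag so a requeued flush rq is re-inserted tag-less */
		blk_mq_put_driver_tag(rq);

		/* ... existing requeue bookkeeping (tracing, scheduler hook, ...) ... */
	}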

-- 
Ming


Re: [PATCH V3 05/12] scsi: sd_zbc: Fix comments and indentation

2017-09-15 Thread Damien Le Moal
On 9/15/17 19:44, Hannes Reinecke wrote:
> On 09/15/2017 12:06 PM, Damien Le Moal wrote:
>> Fix comments style (do not use documented comment style) and add some
>> comments to clarify some functions. Also fix some functions signature
>> indentation and remove a useless blank line in sd_zbc_read_zones().
>>
>> No functional change is introduced by this patch.
>>
>> Signed-off-by: Damien Le Moal 
>> ---
>>  drivers/scsi/scsi_lib.c |  5 +++-
>>  drivers/scsi/sd_zbc.c   | 67 
>> -
>>  2 files changed, 53 insertions(+), 19 deletions(-)
>>
> Actually you should've used kernel-doc style to format the comments;
> just to keep the 0-day bot happy

I thought that kernel-doc format comments for functions were only needed
if you want the function to be documented. That generally includes at
least all the exported functions.
In this case, the functions are mostly local to sd_zbc.c, so I thought
documenting them was not very useful.

I can revert though. No problem at all.

Thanks !

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH V3 01/12] block: Fix declaration of blk-mq debugfs functions

2017-09-15 Thread Damien Le Moal
On 9/16/17 02:45, Christoph Hellwig wrote:
> On Fri, Sep 15, 2017 at 07:06:34PM +0900, Damien Le Moal wrote:
>> __blk_mq_debugfs_rq_show() and blk_mq_debugfs_rq_show() are exported
>> symbols but ar eonly declared in the block internal file
>> block/blk-mq-debugfs.h. which is not cleanly accessible to files outside
>> of the block directory.
>> Move the declaration of these functions to the new file
>> include/linux/blk-mq-debugfs.h file to make the declarations cleanly
>> available to other modules.
>>
>> While at it, also move the definition of the blk_mq_debugfs_attr
>> structure to allow scheduler modules outside of the block directory to
>> define debugfs attributes.
> 
> I think we should keep them local to block/ where all I/O schedulers
> live, so: NAK.

OK. I will drop this one and 02/12.

Thanks.

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Damien Le Moal
On 9/16/17 06:02, Bart Van Assche wrote:
> On Fri, 2017-09-15 at 19:51 +0200, h...@lst.de wrote:
>> On Fri, Sep 15, 2017 at 02:51:03PM +, Bart Van Assche wrote:
>>> On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
 Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
 assignments and move the calculation of sdkp->zone_shift together
 with the assignment of the verified zone_blocks value in
 sd_zbc_check_zone_size().
>>>
>>> Both functions are called from inside 
>>> block_device_operations.revalidate_disk.
>>> Isn't that too late to set use_1[06]_for_rw? Shouldn't both be set just 
>>> after
>>> the slave_alloc callback has been called?
>>
>> Hmm.  sd_read_capacity already is called from the same path and seems
>> to work..
> 
> Hello Christoph,
> 
> My concern is that if both a SCSI LLD and the ZBC code set use_1[06]_for_rw
> that these settings may conflict. Should the ZBC code e.g. refuse that the sd
> driver is attached to a ZBC disk controlled by a SCSI LLD that supports
> use_10_for_rw but not use_16_for_rw?

rw16 is mandatory for ZBC drives. So it has to be set to true. If the
HBA does not support rw16 (why would that happen ?), then the disk
should not be used.

Cheers.

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 5/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> +static void save_path_queue_depth(struct pgpath *p)
> +{
> + struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
> +
> + p->old_nr_requests = q->nr_requests;
> + p->queue_depth = q->queue_depth;
> +
> + /* one extra request for making the pipeline full */
> + if (p->queue_depth)
> + blk_update_nr_requests(q, p->queue_depth + 1);
> +}

blk_mq_init_allocated_queue() initializes nr_requests to the tag set queue 
depth.
Does that mean that the above code increases nr_requests by one? If so, does
that change only result in a performance improvement for the queue depth
mentioned in the patch description (3)? Sorry, but I doubt that this change will
yield a significant improvement for higher queue depths. Does that mean that
this patch can be left out?

Thanks,

Bart.

Re: [PATCH 5/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> 1) lpfc.lpfc_lun_queue_depth=3, so that it is same with .cmd_per_lun

Nobody I know uses such a low queue depth for the lpfc driver. Please also
include performance results for a more realistic queue depth.

Thanks,

Bart.

Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:51 +0200, h...@lst.de wrote:
> On Fri, Sep 15, 2017 at 02:51:03PM +, Bart Van Assche wrote:
> > On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> > > Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
> > > assignments and move the calculation of sdkp->zone_shift together
> > > with the assignment of the verified zone_blocks value in
> > > sd_zbc_check_zone_size().
> > 
> > Both functions are called from inside 
> > block_device_operations.revalidate_disk.
> > Isn't that too late to set use_1[06]_for_rw? Shouldn't both be set just 
> > after
> > the slave_alloc callback has been called?
> 
> Hmm.  sd_read_capacity already is called from the same path and seems
> to work..

Hello Christoph,

My concern is that if both a SCSI LLD and the ZBC code set use_1[06]_for_rw
that these settings may conflict. Should the ZBC code e.g. refuse that the sd
driver is attached to a ZBC disk controlled by a SCSI LLD that supports
use_10_for_rw but not use_16_for_rw?

Thanks,

Bart.

Re: [PATCH 5/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> ---
>   |v4.13+ |v4.13+
>   |+scsi_mq_perf  |+scsi_mq_perf+patches
> -
>  IOPS(K)  |MQ-DEADLINE|MQ-DEADLINE
> --
> read  |   30.71   |  343.91
> -
> randread  |   22.98   |   17.17
> --
> write |   16.45   |  390.88
> --
> randwrite |   16.21   |   16.09
> ---

What is the reason for the random I/O performance regressions? Users whose
workload triggers a random I/O pattern will be really unhappy with a 33% IOPS
regression for random reads. Does that regression get worse for lower latency
transports, e.g. SRP?

Bart.

Re: [PATCH 2/5] dm-mpath: return DM_MAPIO_REQUEUE in case of rq allocation failure

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 16:06 -0400, Mike Snitzer wrote:
> The problem is that multipath_clone_and_map() is now treated as common
> code (thanks to both blk-mq and old .request_fn now enjoying the use of
> blk_get_request) BUT: Ming please understand that this code is used by
> old .request_fn too.  So it would seem that the use of
> DM_MAPIO_DELAY_REQUEUE vs DM_MAPIO_REQUEUE needs to be based on dm-sq vs
> dm-mq.

Hello Mike,

My proposal is to leave out patches 1 and 2 entirely. Since the SCSI core
calls blk_mq_run_hw_queues() anyway after a request has finished it is not
clear to me what the motivation was behind the development of patches 1 and
2 in this series. If the goal was to rerun a queue after a request has
finished I think the same approach should be taken for dm as for the SCSI
core, namely to run the queue from inside the end_io callback.
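A rough sketch of that approach, by analogy with how the SCSI core reruns its
hardware queues from the completion path (the hook below is hypothetical, not
existing dm code):

	/* hypothetical: kick the dm-mpath queue once a clone request finishes */
	static void mpath_end_io_kick(struct request_queue *dm_queue)
	{
		/* run asynchronously to avoid recursing from completion context */
		blk_mq_run_hw_queues(dm_queue, true);
	}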

Bart.

Re: [PATCH 5/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Mike Snitzer
On Fri, Sep 15 2017 at 12:44pm -0400,
Ming Lei  wrote:

> The actual I/O schedule is done in dm-mpath layer, and the
> underlying I/O schedule is simply bypassed.
> 
> This patch sets underlying queue's nr_requests as its queue's
> queue_depth, then  we can get its queue busy feedback by simply
> checking if blk_get_request() returns successfully.
> 
> In this way, dm-mpath can reports its queue busy to block layer
> effectively, so I/O scheduling is improved much.
> 
> Follows the test results on lpfc*:
> 
> - fio(libaio, bs:4k, dio, queue_depth:64, 64 jobs,
>   over dm-mpath disk)
> - system(12 cores, dual sockets, mem: 64G)
> 
> ---
>   |v4.13+ |v4.13+
>   |+scsi_mq_perf  |+scsi_mq_perf+patches
> -
>  IOPS(K)  |MQ-DEADLINE|MQ-DEADLINE
> --
> read  |   30.71   |  343.91
> -
> randread  |   22.98   |   17.17
> --
> write |   16.45   |  390.88
> --
> randwrite |   16.21   |   16.09
> ---
> 
> *:
> 1) lpfc.lpfc_lun_queue_depth=3, so that it is same with .cmd_per_lun
> 2) scsi_mq_perf means the patchset of 'blk-mq-sched: improve SCSI-MQ 
> performance(V4)'[1]
> 3) v4.13+: top commit is 46c1e79fee41 Merge branch 'perf-urgent-for-linus'
> 4) the patchset 'blk-mq-sched: improve SCSI-MQ performance(V4)' focuses
> on improving on SCSI-MQ, and all the test result in that coverletter was
> against the raw lpfc/ib(run after 'multipath -F'), instead of dm-mpath.
> 5) this patchset itself doesn't depend on the scsi_mq_perf patchset[1]
> 
> [1] https://marc.info/?t=15043655572=1=2
> 
> Signed-off-by: Ming Lei 

Very impressive gains Ming!

I'll review in more detail next week but Bart's concerns on patch 1 and
2 definitely need consideration.

Thanks,
Mike


Re: [PATCH 2/5] dm-mpath: return DM_MAPIO_REQUEUE in case of rq allocation failure

2017-09-15 Thread Mike Snitzer
On Fri, Sep 15 2017 at  1:29pm -0400,
Bart Van Assche  wrote:

> On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> > blk-mq will rerun queue via RESTART after one request is completion,
> > so not necessary to wait random time for requeuing, it should trust
> > blk-mq to do it.
> > 
> > Signed-off-by: Ming Lei 
> > ---
> >  drivers/md/dm-mpath.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
> > index 96aedaac2c64..f5a1088a6e79 100644
> > --- a/drivers/md/dm-mpath.c
> > +++ b/drivers/md/dm-mpath.c
> > @@ -505,7 +505,7 @@ static int multipath_clone_and_map(struct dm_target 
> > *ti, struct request *rq,
> > atomic_inc(&m->pg_init_in_progress);
> > activate_or_offline_path(pgpath);
> > }
> > -   return DM_MAPIO_DELAY_REQUEUE;
> > +   return DM_MAPIO_REQUEUE;
> > }
> > clone->bio = clone->biotail = NULL;
> > clone->rq_disk = bdev->bd_disk;
> 
> So you are reverting the patch below? Thank you very much.
> 
> commit 1c23484c355ec360ca2f37914f8a4802c6baeead
> Author: Bart Van Assche 
> Date:   Wed Aug 9 11:32:12 2017 -0700
> 
> dm mpath: do not lock up a CPU with requeuing activity
> 
> When using the block layer in single queue mode, get_request()
> returns ERR_PTR(-EAGAIN) if the queue is dying and the REQ_NOWAIT
> flag has been passed to get_request(). Avoid that the kernel
> reports soft lockup complaints in this case due to continuous
> requeuing activity.
> 
> Fixes: 7083abbbf ("dm mpath: avoid that path removal can trigger an 
> infinite loop")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Bart Van Assche 
> Tested-by: Laurence Oberman 
> Reviewed-by: Christoph Hellwig 
> Signed-off-by: Mike Snitzer 

The problem is that multipath_clone_and_map() is now treated as common
code (thanks to both blk-mq and old .request_fn now enjoying the use of
blk_get_request) BUT: Ming please understand that this code is used by
old .request_fn too.  So it would seem that the use of
DM_MAPIO_DELAY_REQUEUE vs DM_MAPIO_REQUEUE needs to be based on dm-sq vs
dm-mq.


Re: [PATCH 00/10] nvme multipath support on top of nvme-4.13 branch

2017-09-15 Thread Christoph Hellwig
Hi Anish,

I looked over the code a bit, and I'm rather confused by the newly
added commands.  Which controller supports them?  Also the NVMe
working group went down a very different way with the ALUA approach,
which uses different grouping concepts and doesn't require path
activations - for Linux we'd really like to stick to only standardized
multipath implementations.


Re: [PATCH 0/6] blk-mq: don't allocate driver tag beforehand for flush rq

2017-09-15 Thread Jens Axboe
On 09/15/2017 08:29 AM, Jens Axboe wrote:
> On 09/14/2017 08:20 PM, Ming Lei wrote:
>> On Thu, Sep 14, 2017 at 12:51:24PM -0600, Jens Axboe wrote:
>>> On 09/14/2017 10:42 AM, Ming Lei wrote:
 Hi,

 This patchset avoids to allocate driver tag beforehand for flush rq
 in case of I/O scheduler, then flush rq isn't treated specially
 wrt. get/put driver tag, code gets cleanup much, such as,
 reorder_tags_to_front() is removed, and we needn't to worry
 about request order in dispatch list for avoiding I/O deadlock.

 'dbench -t 30 -s -F 64' has been run on different devices(shared tag,
 multi-queue, singele queue, ...), and no issues are observed,
 even very low queue depth(1) test are run, debench still works
 well.
>>>
>>> Gave this a quick spin on the test box, and I get tons of spewage
>>> on booting up:
>>>
>>> [9.131290] WARNING: CPU: 2 PID: 337 at block/blk-mq-sched.c:274 
>>> blk_mq_sched_insert_request+0x15d/0x170
>>
>> Sorry, my fault.
>>
>> The WARN_ON() was inside 'if (has_sched)' actually, and could you
>> please remove the WARN_ON() in blk_mq_sched_bypass_insert() and
>> see if it works?
> 
> Putting it inside 'has_sched' makes it go away. I'll fire up some
> testing of it here.

It still triggers for the requeue path, however:

[12403.280753] WARNING: CPU: 4 PID: 2279 at block/blk-mq-sched.c:275 
blk_mq_sched_insert_request+0
[12403.302465] Modules linked in: null_blk configfs crct10dif_generic 
crct10dif_common loop dm_mo]
[12403.331762] CPU: 4 PID: 2279 Comm: kworker/4:1H Tainted: GW   
4.13.0+ #473
[12403.341497] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
11/09/2016
[12403.350745] Workqueue: kblockd blk_mq_requeue_work
[12403.356574] task: 881ff6c57000 task.stack: 881ff2acc000
[12403.363667] RIP: 0010:blk_mq_sched_insert_request+0x15d/0x170
[12403.370565] RSP: 0018:881ff2acfdc0 EFLAGS: 00010213
[12403.376878] RAX: 881ff2868000 RBX: 880c7677be00 RCX: 00022003
[12403.385333] RDX: 881ff32d8c00 RSI: 0001 RDI: 880c7677be00
[12403.393790] RBP: 881ff2acfe08 R08: 0001 R09: 8809010d4000
[12403.402240] R10:  R11: 1000 R12: 881ff37b3800
[12403.410695] R13:  R14:  R15: e8dfffe86940
[12403.419150] FS:  () GS:881fff68() 
knlGS:
[12403.429065] CS:  0010 DS:  ES:  CR0: 80050033
[12403.435961] CR2: 7f13f4ce1000 CR3: 001ff357d002 CR4: 003606e0
[12403.14] DR0:  DR1:  DR2: 
[12403.452868] DR3:  DR6: fffe0ff0 DR7: 0400
[12403.461316] Call Trace:
[12403.464528]  ? __blk_mq_delay_run_hw_queue+0x84/0xa0
[12403.470550]  blk_mq_requeue_work+0xc6/0x140
[12403.475702]  process_one_work+0x18a/0x3e0
[12403.480654]  worker_thread+0x48/0x3b0
[12403.485313]  kthread+0x12a/0x140
[12403.489390]  ? process_one_work+0x3e0/0x3e0
[12403.494545]  ? kthread_create_on_node+0x40/0x40
[12403.500078]  ret_from_fork+0x22/0x30
[12403.504551] Code: c0 4c 89 fa e8 e5 97 02 00 48 8b 4d c0 84 c0 74 10 49 89 
5f 08 4c 89 3b 48 8 
[12403.526954] ---[ end trace 2eefb804292867b5 ]---

The code now looks like this:

if (has_sched) {
WARN_ON(rq->tag != -1); 
rq->rq_flags |= RQF_SORTED; 
}

-- 
Jens Axboe



Re: [PATCH 1/5] block: don't call blk_mq_delay_run_hw_queue() in case of BLK_STS_RESOURCE

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> If .queue_rq() returns BLK_STS_RESOURCE, blk-mq will rerun
> the queue in the three situations:
> 
> 1) if BLK_MQ_S_SCHED_RESTART is set
> - queue is rerun after one rq is completed, see blk_mq_sched_restart()
> which is run from blk_mq_free_request()
> 
> 2) BLK_MQ_S_TAG_WAITING is set
> - queue is rerun after one tag is freed
> 
> 3) otherwise
> - queue is run immediately in blk_mq_dispatch_rq_list()
> 
> So calling blk_mq_delay_run_hw_queue() inside .queue_rq() doesn't make
> sense because no matter it is called or not, the queue still will be
> rerun soon in above three situations, and the busy req can be dispatched
> again.

NAK

Block drivers call blk_mq_delay_run_hw_queue() if it is known that the queue
has to be rerun later even if no request has completed before the delay has
expired. This patch breaks at least the SCSI core and the dm-mpath drivers.
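For context, a sketch of the pattern Bart is describing (the device, its
resource check, and the issue helper are illustrative names, not real code):

	static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		struct demo_dev *dev = hctx->queue->queuedata;	/* hypothetical */

		if (!demo_dev_has_resources(dev)) {		/* hypothetical */
			/* nothing in flight is guaranteed to restart us, so self-rerun */
			blk_mq_delay_run_hw_queue(hctx, 100 /* msecs */);
			return BLK_STS_RESOURCE;
		}

		demo_dev_issue(dev, bd->rq);			/* hypothetical */
		return BLK_STS_OK;
	}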

Bart.

Re: [PATCH V3 08/12] scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

2017-09-15 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH V3 07/12] scsi: sd_zbc: Use well defined macros

2017-09-15 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread h...@lst.de
On Fri, Sep 15, 2017 at 02:51:03PM +, Bart Van Assche wrote:
> On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> > Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
> > assignments and move the calculation of sdkp->zone_shift together
> > with the assignment of the verified zone_blocks value in
> > sd_zbc_check_zone_size().
> 
> Both functions are called from inside block_device_operations.revalidate_disk.
> Isn't that too late to set use_1[06]_for_rw? Shouldn't both be set just after
> the slave_alloc callback has been called?

Hmm.  sd_read_capacity already is called from the same path and seems
to work..

> 
> Thanks,
> 
> Bart.---end quoted text---


Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH V3 04/12] scsi: sd_zbc: Move ZBC declarations to scsi_proto.h

2017-09-15 Thread Christoph Hellwig
Looks fine,

Reviewed-by: Christoph Hellwig 


Re: [PATCH V3 03/12] block: Add zoned block device information to request queue

2017-09-15 Thread Christoph Hellwig
> +struct blk_zoned {
> +	unsigned int		nr_zones;
> +	unsigned long		*seq_zones;
> +};
> +
>  struct blk_zone_report_hdr {
>   unsigned intnr_zones;
>   u8  padding[60];
> @@ -492,6 +497,10 @@ struct request_queue {
>   struct blk_integrity integrity;
>  #endif   /* CONFIG_BLK_DEV_INTEGRITY */
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> +	struct blk_zoned	zoned;
> +#endif

I'd prefer to just add the two fields directly to struct request_queue
instead of the container structure.

> +static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
> +{
> + return blk_queue_is_zoned(q) ? q->zoned.nr_zones : 0;
> +}

I don't think this is going to compile without CONFIG_BLK_DEV_ZONED,
we'd either need two versions of it, or just always add nr_zones to
the request queue.

With nr_zones always present we could then remove the first check too,
as it would always be initialized to zero.
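A sketch of that alternative (field placement in struct request_queue is an
assumption; the point is that nr_zones exists unconditionally and defaults to
zero):

	struct request_queue {
		/* ... existing fields ... */
		unsigned int		nr_zones;	/* 0 unless the device is zoned */
	#ifdef CONFIG_BLK_DEV_ZONED
		unsigned long		*seq_zones;	/* bitmap of sequential zones */
	#endif
		/* ... existing fields ... */
	};

	static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
	{
		return q->nr_zones;
	}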


Re: [PATCH V3 02/12] block: Fix declaration of blk-mq scheduler functions

2017-09-15 Thread Christoph Hellwig
Same as for patch 1: this should stay local to block/ - we don't
want random drivers to grow I/O schedulers.


Re: [PATCH V3 01/12] block: Fix declaration of blk-mq debugfs functions

2017-09-15 Thread Christoph Hellwig
On Fri, Sep 15, 2017 at 07:06:34PM +0900, Damien Le Moal wrote:
> __blk_mq_debugfs_rq_show() and blk_mq_debugfs_rq_show() are exported
> symbols but ar eonly declared in the block internal file
> block/blk-mq-debugfs.h. which is not cleanly accessible to files outside
> of the block directory.
> Move the declaration of these functions to the new file
> include/linux/blk-mq-debugfs.h file to make the declarations cleanly
> available to other modules.
> 
> While at it, also move the definition of the blk_mq_debugfs_attr
> structure to allow scheduler modules outside of the block directory to
> define debugfs attributes.

I think we should keep them local to block/ where all I/O schedulers
live, so: NAK.


Re: [PATCH 2/5] dm-mpath: return DM_MAPIO_REQUEUE in case of rq allocation failure

2017-09-15 Thread Bart Van Assche
On Sat, 2017-09-16 at 00:44 +0800, Ming Lei wrote:
> blk-mq will rerun queue via RESTART after one request is completion,
> so not necessary to wait random time for requeuing, it should trust
> blk-mq to do it.
> 
> Signed-off-by: Ming Lei 
> ---
>  drivers/md/dm-mpath.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
> index 96aedaac2c64..f5a1088a6e79 100644
> --- a/drivers/md/dm-mpath.c
> +++ b/drivers/md/dm-mpath.c
> @@ -505,7 +505,7 @@ static int multipath_clone_and_map(struct dm_target *ti, 
> struct request *rq,
> > atomic_inc(&m->pg_init_in_progress);
>   activate_or_offline_path(pgpath);
>   }
> - return DM_MAPIO_DELAY_REQUEUE;
> + return DM_MAPIO_REQUEUE;
>   }
>   clone->bio = clone->biotail = NULL;
>   clone->rq_disk = bdev->bd_disk;

So you are reverting the patch below? Thank you very much.

commit 1c23484c355ec360ca2f37914f8a4802c6baeead
Author: Bart Van Assche 
Date:   Wed Aug 9 11:32:12 2017 -0700

dm mpath: do not lock up a CPU with requeuing activity

When using the block layer in single queue mode, get_request()
returns ERR_PTR(-EAGAIN) if the queue is dying and the REQ_NOWAIT
flag has been passed to get_request(). Avoid that the kernel
reports soft lockup complaints in this case due to continuous
requeuing activity.

Fixes: 7083abbbf ("dm mpath: avoid that path removal can trigger an 
infinite loop")
Cc: sta...@vger.kernel.org
Signed-off-by: Bart Van Assche 
Tested-by: Laurence Oberman 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Mike Snitzer 

[PATCH 3/5] dm-mpath: remove annoying message of 'blk_get_request() returned -11'

2017-09-15 Thread Ming Lei
It is very normal to see allocation failures, so it's not necessary
to log them and annoy people.

Signed-off-by: Ming Lei 
---
 drivers/md/dm-mpath.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f5a1088a6e79..f57ad8621c4c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -499,8 +499,6 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
bool queue_dying = blk_queue_dying(q);
-   DMERR_LIMIT("blk_get_request() returned %ld%s - requeuing",
-   PTR_ERR(clone), queue_dying ? " (path offline)" : 
"");
if (queue_dying) {
atomic_inc(&m->pg_init_in_progress);
activate_or_offline_path(pgpath);
-- 
2.9.5



[PATCH 5/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Ming Lei
The actual I/O scheduling is done in the dm-mpath layer, and the
underlying I/O scheduler is simply bypassed.

This patch sets the underlying queue's nr_requests to that queue's
queue_depth, so we can get the underlying queue's busy feedback by
simply checking whether blk_get_request() succeeds.

In this way, dm-mpath can report its queue busy state to the block
layer effectively, so I/O scheduling is improved a lot.

The test results on lpfc* follow:

- fio(libaio, bs:4k, dio, queue_depth:64, 64 jobs,
  over dm-mpath disk)
- system(12 cores, dual sockets, mem: 64G)

---------------------------------------------------
           | v4.13+         | v4.13+
           | +scsi_mq_perf  | +scsi_mq_perf+patches
---------------------------------------------------
 IOPS(K)   | MQ-DEADLINE    | MQ-DEADLINE
---------------------------------------------------
 read      |     30.71      |    343.91
---------------------------------------------------
 randread  |     22.98      |     17.17
---------------------------------------------------
 write     |     16.45      |    390.88
---------------------------------------------------
 randwrite |     16.21      |     16.09
---------------------------------------------------

*:
1) lpfc.lpfc_lun_queue_depth=3, so that it is the same as .cmd_per_lun
2) scsi_mq_perf means the patchset 'blk-mq-sched: improve SCSI-MQ
performance (V4)'[1]
3) v4.13+: top commit is 46c1e79fee41 Merge branch 'perf-urgent-for-linus'
4) the patchset 'blk-mq-sched: improve SCSI-MQ performance (V4)' focuses
on improving SCSI-MQ, and all the test results in that cover letter were
against the raw lpfc/ib (run after 'multipath -F'), instead of dm-mpath.
5) this patchset itself doesn't depend on the scsi_mq_perf patchset[1]

[1] https://marc.info/?t=15043655572=1=2

Signed-off-by: Ming Lei 
---
 drivers/md/dm-mpath.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f57ad8621c4c..02647386d2d9 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -39,6 +39,8 @@ struct pgpath {
 
struct dm_path path;
struct delayed_work activate_path;
+   unsigned old_nr_requests;
+   unsigned queue_depth;
 
bool is_active:1;   /* Path status */
 };
@@ -160,12 +162,34 @@ static struct priority_group *alloc_priority_group(void)
return pg;
 }
 
+static void save_path_queue_depth(struct pgpath *p)
+{
+   struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+
+   p->old_nr_requests = q->nr_requests;
+   p->queue_depth = q->queue_depth;
+
+   /* one extra request for making the pipeline full */
+   if (p->queue_depth)
+   blk_update_nr_requests(q, p->queue_depth + 1);
+}
+
+static void restore_path_queue_depth(struct pgpath *p)
+{
+   struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+
+   /* nr->requests isn't changed, we restore to old value */
+   if (q->nr_requests == p->queue_depth + 1)
+   blk_update_nr_requests(q, p->old_nr_requests);
+}
+
 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 {
struct pgpath *pgpath, *tmp;
 
list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
list_del(&pgpath->list);
+   restore_path_queue_depth(pgpath);
dm_put_device(ti, pgpath->path.dev);
free_pgpath(pgpath);
}
@@ -810,6 +834,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, 
struct path_selector *ps
goto bad;
}
 
+   save_path_queue_depth(p);
+
return p;
 
  bad:
-- 
2.9.5



[PATCH 4/5] block: export blk_update_nr_requests

2017-09-15 Thread Ming Lei
dm-mpath needs this API for improving I/O scheduling.

The I/O scheduling is actually done on the dm-rq (mpath) queue,
instead of on the underlying devices.

If we set q->nr_requests to q->queue_depth on the underlying
devices, we can get the queue's busy feedback by simply
checking whether blk_get_request() succeeds.

This makes the block layer's I/O scheduling more effective
on the dm-rq device.

Signed-off-by: Ming Lei 
---
 block/blk-core.c   | 4 +++-
 block/blk-sysfs.c  | 5 +
 block/blk.h| 2 --
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index aebe676225e6..9752aac9821c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1108,7 +1108,8 @@ int blk_update_nr_requests(struct request_queue *q, 
unsigned int nr)
struct request_list *rl;
int on_thresh, off_thresh;
 
-   WARN_ON_ONCE(q->mq_ops);
+   if (q->mq_ops)
+   return blk_mq_update_nr_requests(q, nr);
 
spin_lock_irq(q->queue_lock);
q->nr_requests = nr;
@@ -1145,6 +1146,7 @@ int blk_update_nr_requests(struct request_queue *q, 
unsigned int nr)
spin_unlock_irq(q->queue_lock);
return 0;
 }
+EXPORT_SYMBOL(blk_update_nr_requests);
 
 /**
  * __get_request - get a free request
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b8362c0df51d..2dbef5dbd195 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -77,10 +77,7 @@ queue_requests_store(struct request_queue *q, const char 
*page, size_t count)
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
 
-   if (q->request_fn)
-   err = blk_update_nr_requests(q, nr);
-   else
-   err = blk_mq_update_nr_requests(q, nr);
+   err = blk_update_nr_requests(q, nr);
 
if (err)
return err;
diff --git a/block/blk.h b/block/blk.h
index fa4f232afc18..5bf662da2417 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -308,8 +308,6 @@ static inline int queue_congestion_off_threshold(struct 
request_queue *q)
return q->nr_congestion_off;
 }
 
-extern int blk_update_nr_requests(struct request_queue *, unsigned int);
-
 /*
  * Contribute to IO statistics IFF:
  *
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 460294bb0fa5..226b97142999 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1229,6 +1229,7 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 extern void blk_set_queue_dying(struct request_queue *);
+extern int blk_update_nr_requests(struct request_queue *, unsigned int);
 
 /*
  * block layer runtime pm functions
-- 
2.9.5



[PATCH 2/5] dm-mpath: return DM_MAPIO_REQUEUE in case of rq allocation failure

2017-09-15 Thread Ming Lei
blk-mq will rerun the queue via RESTART after one request completes,
so it's not necessary to wait a random amount of time before requeueing;
we should trust blk-mq to do it.

Signed-off-by: Ming Lei 
---
 drivers/md/dm-mpath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 96aedaac2c64..f5a1088a6e79 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -505,7 +505,7 @@ static int multipath_clone_and_map(struct dm_target *ti, 
struct request *rq,
atomic_inc(&m->pg_init_in_progress);
activate_or_offline_path(pgpath);
}
-   return DM_MAPIO_DELAY_REQUEUE;
+   return DM_MAPIO_REQUEUE;
}
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
-- 
2.9.5



[PATCH 0/5] dm-mpath: improve I/O schedule

2017-09-15 Thread Ming Lei
Hi,

We depend on the I/O scheduler at the dm-mpath layer, and the
underlying I/O scheduler is basically bypassed.

The I/O scheduler depends on the queue busy condition to
trigger I/O merging. Unfortunately, inside dm-mpath the
underlying queue's busy feedback is not accurate enough:
we just allocate one request and dispatch it to the
underlying queue, no matter whether that queue is busy
or not. I/O merging is then hard to trigger.

This patchset sets the underlying queue's nr_requests to
that queue's queue depth, so that the queue's busy state
can be detected by checking whether a request is allocated
successfully.
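As a rough illustration of the resulting busy feedback, simplified from
multipath_clone_and_map() as modified by patch 2/5 (error handling trimmed):

	/* with nr_requests clamped to the path's queue_depth, a failed
	 * allocation now means "this path's queue is full"
	 */
	clone = blk_get_request(q, rq->cmd_flags | REQ_NOWAIT, GFP_ATOMIC);
	if (IS_ERR(clone))
		return DM_MAPIO_REQUEUE;	/* busy: blk-mq will restart us */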

From the test results on mq-deadline, sequential I/O performance
is improved a lot; see the test results in patch 5's commit log.

Any comments are welcome!

Thanks,
Ming

Ming Lei (5):
  block: don't call blk_mq_delay_run_hw_queue() in case of
BLK_STS_RESOURCE
  dm-mpath: return DM_MAPIO_REQUEUE in case of rq allocation failure
  dm-mpath: remove annoying message of 'blk_get_request() returned -11'
  block: export blk_update_nr_requests
  dm-mpath: improve I/O schedule

 block/blk-core.c|  4 +++-
 block/blk-sysfs.c   |  5 +
 block/blk.h |  2 --
 drivers/md/dm-mpath.c   | 30 +++---
 drivers/md/dm-rq.c  |  1 -
 drivers/nvme/host/fc.c  |  3 ---
 drivers/scsi/scsi_lib.c |  4 
 include/linux/blkdev.h  |  1 +
 8 files changed, 32 insertions(+), 18 deletions(-)

-- 
2.9.5



Re: [PATCH V3 07/12] scsi: sd_zbc: Use well defined macros

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> instead of open coding, use the min() macro to calculate a report zones
> reply buffer length in sd_zbc_check_zone_size() and the round_up()
> macro for calculating the number of zones in sd_zbc_setup().

Reviewed-by: Bart Van Assche 



Re: [PATCH V3 10/12] scsi: sd_zbc: Limit zone write locking to sequential zones

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> +  * There is no write constraints on conventional zones. So any write
   ^^^
Should this have been "There are no"?

> - if (sdkp->zones_wlock &&
> - test_and_set_bit(zno, sdkp->zones_wlock))
> + if (q->zoned.seq_zones && test_bit(zno, q->zoned.seq_zones))
> + return BLKPREP_OK;
> + if (sdkp->zones_wlock && test_and_set_bit(zno, sdkp->zones_wlock))
>   return BLKPREP_DEFER;

Please introduce helper functions in include/linux/blkdev.h or in a new
header file for verifying whether or not a zone is a sequential zone and also
for locking a zone for writes. I expect that that will make the above code
significantly easier to read.
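A sketch of the kind of helpers being asked for (names and placement are
assumptions, building on the seq_zones bitmap added earlier in this series):

	static inline unsigned int blk_queue_zone_no(struct request_queue *q,
						     sector_t sector)
	{
		/* zone size is a power of two for these devices */
		return sector >> ilog2(blk_queue_zone_sectors(q));
	}

	static inline bool blk_queue_zone_is_seq(struct request_queue *q,
						 sector_t sector)
	{
		return q->zoned.seq_zones &&
		       test_bit(blk_queue_zone_no(q, sector), q->zoned.seq_zones);
	}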

Thanks,

Bart.

Re: [PATCH V3 1/4] kthread: add a mechanism to store cgroup info

2017-09-15 Thread Tejun Heo
On Thu, Sep 14, 2017 at 02:02:04PM -0700, Shaohua Li wrote:
> From: Shaohua Li 
> 
> kthread usually runs jobs on behalf of other threads. The jobs should be
> charged to cgroup of original threads. But the jobs run in a kthread,
> where we lose the cgroup context of original threads. The patch adds a
> machanism to record cgroup info of original threads in kthread context.
> Later we can retrieve the cgroup info and attach the cgroup info to jobs.
> 
> Since this mechanism is only required by kthread, we store the cgroup
> info in kthread data instead of generic task_struct.
> 
> Signed-off-by: Shaohua Li 

Acked-by: Tejun Heo 

Thanks.

-- 
tejun


Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
> assignments and move the calculation of sdkp->zone_shift together
> with the assignment of the verified zone_blocks value in
> sd_zbc_check_zone_size().

Both functions are called from inside block_device_operations.revalidate_disk.
Isn't that too late to set use_1[06]_for_rw? Shouldn't both be set just after
the slave_alloc callback has been called?

Thanks,

Bart.

Re: [PATCH V3 03/12] block: Add zoned block device information to request queue

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> @@ -492,6 +497,10 @@ struct request_queue {
>   struct blk_integrity integrity;
>  #endif   /* CONFIG_BLK_DEV_INTEGRITY */
>  
> +#ifdef CONFIG_BLK_DEV_ZONED
> + struct blk_zonedzoned;
> +#endif
> +
>  #ifdef CONFIG_PM
>   struct device   *dev;
>   int rpm_status;
> @@ -785,6 +794,11 @@ static inline unsigned int blk_queue_zone_sectors(struct 
> request_queue *q)
>   return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
>  }
>  
> +static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
> +{
> + return blk_queue_is_zoned(q) ? q->zoned.nr_zones : 0;
> +}

Does this code compile correctly if CONFIG_BLK_DEV_ZONED is disabled? Should
the definition of blk_queue_nr_zones() perhaps be surrounded by #ifdef
CONFIG_BLK_DEV_ZONED / #endif?

Thanks,

Bart.

Re: [PATCH V3 02/12] block: Fix declaration of blk-mq scheduler functions

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> The functions blk_mq_sched_free_hctx_data(), blk_mq_sched_try_merge(),
> blk_mq_sched_try_insert_merge() and blk_mq_sched_request_inserted() are
> all exported symbols but are declared only internally in
> block/blk-mq-sched.h. Move these declarations to the new file
> include/linux/blk-mq-sched.h to make them available to block scheduler
> modules implemented outside of the block directory.

Same comment here: should the title of this patch perhaps have been "Move
declarations of ..."?

> +#ifndef BLK_MQ_SCHED_H
> +#define BLK_MQ_SCHED_H
> +
> +/*
> + * Scheduler helper functions.
> + */
> +void blk_mq_sched_free_hctx_data(struct request_queue *q,
> +  void (*exit)(struct blk_mq_hw_ctx *));
> +void blk_mq_sched_request_inserted(struct request *rq);
> +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
> + struct request **merged_request);
> +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request 
> *rq);
> +
> +#endif

Please make sure that the order of #include directives does not affect the
compilation result, e.g. by adding forward declarations for the structures
used as arguments.
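
A sketch of the new header with such forward declarations added; the
function declarations themselves are copied from the quoted hunk:

#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H

/* Forward declarations keep this header independent of include order. */
struct request_queue;
struct request;
struct bio;
struct blk_mq_hw_ctx;

/*
 * Scheduler helper functions.
 */
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *));
void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request);
bool blk_mq_sched_try_insert_merge(struct request_queue *q,
				   struct request *rq);

#endif /* BLK_MQ_SCHED_H */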

Thanks,

Bart.

Re: [PATCH V3 01/12] block: Fix declaration of blk-mq debugfs functions

2017-09-15 Thread Bart Van Assche
On Fri, 2017-09-15 at 19:06 +0900, Damien Le Moal wrote:
> __blk_mq_debugfs_rq_show() and blk_mq_debugfs_rq_show() are exported
> symbols but ar eonly declared in the block internal file
  
  are only?
> block/blk-mq-debugfs.h. which is not cleanly accessible to files outside
> of the block directory.
> Move the declaration of these functions to the new file
> include/linux/blk-mq-debugfs.h file to make the declarations cleanly
> available to other modules.
> 
> While at it, also move the definition of the blk_mq_debugfs_attr
> structure to allow scheduler modules outside of the block directory to
> define debugfs attributes.

The title of this patch is "Fix declaration of ...". Should the title perhaps
have been "Move declarations of ..."?

Anyway:

Reviewed-by: Bart Van Assche 



Re: [PATCH 0/6] blk-mq: don't allocate driver tag beforehand for flush rq

2017-09-15 Thread Jens Axboe
On 09/14/2017 08:20 PM, Ming Lei wrote:
> On Thu, Sep 14, 2017 at 12:51:24PM -0600, Jens Axboe wrote:
>> On 09/14/2017 10:42 AM, Ming Lei wrote:
>>> Hi,
>>>
>>> This patchset avoids to allocate driver tag beforehand for flush rq
>>> in case of I/O scheduler, then flush rq isn't treated specially
>>> wrt. get/put driver tag, code gets cleanup much, such as,
>>> reorder_tags_to_front() is removed, and we needn't to worry
>>> about request order in dispatch list for avoiding I/O deadlock.
>>>
>>> 'dbench -t 30 -s -F 64' has been run on different devices (shared tag,
>>> multi-queue, single queue, ...), and no issues are observed;
>>> even when very low queue depth (1) tests are run, dbench still works
>>> well.
>>
>> Gave this a quick spin on the test box, and I get tons of spewage
>> on booting up:
>>
>> [9.131290] WARNING: CPU: 2 PID: 337 at block/blk-mq-sched.c:274 
>> blk_mq_sched_insert_request+0x15d/0x170
> 
> Sorry, my fault.
> 
> The WARN_ON() was inside 'if (has_sched)' actually, and could you
> please remove the WARN_ON() in blk_mq_sched_bypass_insert() and
> see if it works?

Putting it inside 'has_sched' makes it go away. I'll fire up some
testing of it here.

-- 
Jens Axboe



Re: [PATCH v2 0/2] Add wrapper for blkcg policy operations

2017-09-15 Thread weiping zhang
On Fri, Sep 01, 2017 at 10:16:45PM +0800, weiping zhang wrote:
> The first patch is the V2 of [PATCH] blkcg: check pol->cpd_free_fn
> before free cpd; it fixes a check before freeing cpd.
> 
> The second patch adds some wrappers for struct blkcg_policy->xxx_fn,
> because not every block cgroup policy implements all operations.
> 
> weiping zhang (2):
>   blkcg: check pol->cpd_free_fn before free cpd
>   blkcg: add wrappers for struct blkcg_policy operations
> 
>  block/blk-cgroup.c | 57 +---
>  include/linux/blk-cgroup.h | 72 
> ++
>  2 files changed, 99 insertions(+), 30 deletions(-)
> 
Hello Jens,

ping
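
An illustrative sketch of the kind of wrapper the series describes
(invoking the per-policy callback only if the policy implements it);
the wrapper name here is an assumption, the exact helpers are in the
patches themselves:

/*
 * Sketch only: mirror the cpd_free_fn check from patch 1 as a wrapper.
 */
static inline void blkcg_policy_free_cpd(struct blkcg_policy *pol,
					 struct blkcg_policy_data *cpd)
{
	if (pol->cpd_free_fn)
		pol->cpd_free_fn(cpd);
}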



[PATCH] lightnvm: pblk: remove redundant check on read path

2017-09-15 Thread Javier González
A partial read I/O in pblk is an I/O where some sectors reside in the
write buffer in main memory and some are persisted on the device. Such
an I/O must contain at least 2 lbas, so checking for the case where a
single lba is mapped is not necessary.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-read.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 402f8eff6a2e..71c58503f1a4 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -235,7 +235,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
rqd->end_io = pblk_end_io_sync;
rqd->private = 
 
-   if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+   if (unlikely(nr_holes == 1)) {
ppa_ptr = rqd->ppa_list;
dma_ppa_list = rqd->dma_ppa_list;
rqd->ppa_addr = rqd->ppa_list[0];
@@ -260,7 +260,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
 #endif
}
 
-   if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+   if (unlikely(nr_holes == 1)) {
struct ppa_addr ppa;
 
ppa = rqd->ppa_addr;
-- 
2.7.4



[PATCH] lightnvm: pblk: check lba sanity on read path

2017-09-15 Thread Javier González
As part of pblk's recovery scheme, we store the lba mapped to each
physical sector in the device's out-of-band (OOB) area.

On the read path, we can use this information to validate that the data
being delivered to the upper layers corresponds to the lba being
requested. The cost of this check is an extra copy on the DMA region on
the device and an extra comparison in the host, given that (i) the OOB
area is being read together with the data in the media, and (ii) the DMA
region allocated for the ppa list can be reused for the metadata stored
on the OOB area.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-read.c | 51 +---
 drivers/lightnvm/pblk.h  |  4 +++-
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 0299fc08291d..a465d9980df4 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -41,6 +41,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio 
*bio,
 static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 sector_t blba, unsigned long *read_bitmap)
 {
+   struct pblk_sec_meta *meta_list = rqd->meta_list;
struct bio *bio = rqd->bio;
struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
int nr_secs = rqd->nr_ppas;
@@ -56,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct 
nvm_rq *rqd,
 retry:
if (pblk_ppa_empty(p)) {
WARN_ON(test_and_set_bit(i, read_bitmap));
+   meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
 
if (unlikely(!advanced_bio)) {
bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -75,6 +77,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct 
nvm_rq *rqd,
goto retry;
}
WARN_ON(test_and_set_bit(i, read_bitmap));
+   meta_list[i].lba = cpu_to_le64(lba);
advanced_bio = true;
 #ifdef CONFIG_NVM_DEBUG
atomic_long_inc(>cache_reads);
@@ -110,10 +113,26 @@ static int pblk_submit_read_io(struct pblk *pblk, struct 
nvm_rq *rqd)
return NVM_IO_OK;
 }
 
+static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd,
+  sector_t blba)
+{
+   struct pblk_sec_meta *meta_list = rqd->meta_list;
+   int nr_lbas = rqd->nr_ppas;
+   int i;
+
+   for (i = 0; i < nr_lbas; i++) {
+   u64 lba = le64_to_cpu(meta_list[i].lba);
+
+   if (lba == ADDR_EMPTY)
+   continue;
+
+   WARN(lba != blba + i, "pblk: corrupted read LBA\n");
+   }
+}
+
 static void pblk_end_io_read(struct nvm_rq *rqd)
 {
struct pblk *pblk = rqd->private;
-   struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio;
 
@@ -124,6 +143,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
 #endif
 
+   pblk_read_check(pblk, rqd, r_ctx->lba);
+
bio_put(bio);
if (r_ctx->private) {
struct bio *orig_bio = r_ctx->private;
@@ -149,15 +170,21 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
  unsigned long *read_bitmap)
 {
struct bio *new_bio, *bio = rqd->bio;
+   struct pblk_sec_meta *meta_list = rqd->meta_list;
struct bio_vec src_bv, dst_bv;
void *ppa_ptr = NULL;
void *src_p, *dst_p;
dma_addr_t dma_ppa_list = 0;
+   __le64 *lba_list_mem, *lba_list_media;
int nr_secs = rqd->nr_ppas;
int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
int i, ret, hole;
DECLARE_COMPLETION_ONSTACK(wait);
 
+   /* Re-use allocated memory for intermediate lbas */
+   lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
+   lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
+
new_bio = bio_alloc(GFP_KERNEL, nr_holes);
 
if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
@@ -168,6 +195,9 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
goto err;
}
 
+   for (i = 0; i < nr_secs; i++)
+   lba_list_mem[i] = meta_list[i].lba;
+
new_bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
 
@@ -207,10 +237,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
rqd->dma_ppa_list = dma_ppa_list;
}
 
+   for (i = 0; i < nr_secs; i++) {
+   lba_list_media[i] = meta_list[i].lba;
+   meta_list[i].lba = lba_list_mem[i];
+   }
+
   

[PATCH] lightnvm: pblk: remove I/O dependency on write path

2017-09-15 Thread Javier González
pblk schedules user I/O, metadata I/O and erases on the write path in
order to minimize collisions at the media level. Until now, there has
been a dependency between user and metadata I/Os that could lead to a
deadlock as both take the per-LUN semaphore to schedule submission.

This patch removes this dependency and guarantees forward progress at a
per-I/O granularity.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-write.c | 145 +++---
 1 file changed, 65 insertions(+), 80 deletions(-)

diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index cec526759547..89a1b46ec728 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -223,15 +223,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct 
nvm_rq *rqd,
 }
 
 static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
-  struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa)
+  struct ppa_addr *erase_ppa)
 {
struct pblk_line_meta *lm = >lm;
struct pblk_line *e_line = pblk_line_get_erase(pblk);
+   struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
unsigned int valid = c_ctx->nr_valid;
unsigned int padded = c_ctx->nr_padded;
unsigned int nr_secs = valid + padded;
unsigned long *lun_bitmap;
-   int ret = 0;
+   int ret;
 
lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
if (!lun_bitmap)
@@ -297,55 +298,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, 
unsigned int secs_avail,
return secs_to_sync;
 }
 
-static inline int pblk_valid_meta_ppa(struct pblk *pblk,
- struct pblk_line *meta_line,
- struct ppa_addr *ppa_list, int nr_ppas)
-{
-   struct nvm_tgt_dev *dev = pblk->dev;
-   struct nvm_geo *geo = >geo;
-   struct pblk_line *data_line;
-   struct ppa_addr ppa, ppa_opt;
-   u64 paddr;
-   int i;
-
-   data_line = >lines[pblk_dev_ppa_to_line(ppa_list[0])];
-   paddr = pblk_lookup_page(pblk, meta_line);
-   ppa = addr_to_gen_ppa(pblk, paddr, 0);
-
-   if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap))
-   return 1;
-
-   /* Schedule a metadata I/O that is half the distance from the data I/O
-* with regards to the number of LUNs forming the pblk instance. This
-* balances LUN conflicts across every I/O.
-*
-* When the LUN configuration changes (e.g., due to GC), this distance
-* can align, which would result on a LUN deadlock. In this case, modify
-* the distance to not be optimal, but allow metadata I/Os to succeed.
-*/
-   ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0);
-   if (unlikely(ppa_opt.ppa == ppa.ppa)) {
-   data_line->meta_distance--;
-   return 0;
-   }
-
-   for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
-   if (ppa_list[i].g.ch == ppa_opt.g.ch &&
-   ppa_list[i].g.lun == ppa_opt.g.lun)
-   return 1;
-
-   if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) {
-   for (i = 0; i < nr_ppas; i += pblk->min_write_pgs)
-   if (ppa_list[i].g.ch == ppa.g.ch &&
-   ppa_list[i].g.lun == ppa.g.lun)
-   return 0;
-
-   return 1;
-   }
-
-   return 0;
-}
-
 int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 {
struct nvm_tgt_dev *dev = pblk->dev;
@@ -424,8 +376,44 @@ int pblk_submit_meta_io(struct pblk *pblk, struct 
pblk_line *meta_line)
return ret;
 }
 
-static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list,
-  int prev_n)
+static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
+  struct pblk_line *meta_line,
+  struct nvm_rq *data_rqd)
+{
+   struct nvm_tgt_dev *dev = pblk->dev;
+   struct nvm_geo *geo = >geo;
+   struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
+   struct pblk_line *data_line = pblk_line_get_data(pblk);
+   struct ppa_addr ppa, ppa_opt;
+   u64 paddr;
+   int pos_opt;
+
+   /* Schedule a metadata I/O that is half the distance from the data I/O
+* with regards to the number of LUNs forming the pblk instance. This
+* balances LUN conflicts across every I/O.
+*
+* When the LUN configuration changes (e.g., due to GC), this distance
+* can align, which would result on metadata and data I/Os colliding. In
+* this case, modify the distance to not be optimal, but move the
+* optimal in the right direction.
+*/
+   paddr = pblk_lookup_page(pblk, 

[PATCH] lightnvm: pblk: ensure right bad block calculation

2017-09-15 Thread Javier González
Make sure that the variable controlling the block threshold for
allocating extra metadata sectors, in case of a line with bad blocks,
does not get a negative value. Otherwise, the line will be marked as
corrupted and wasted.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 9d9adcf3e141..7cf4b536d899 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -707,7 +707,7 @@ static int pblk_lines_init(struct pblk *pblk)
goto add_emeta_page;
}
 
-   lm->emeta_bb = geo->nr_luns - i;
+   lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0;
 
lm->min_blk_line = 1;
if (geo->nr_luns > 1)
-- 
2.7.4



[PATCH] lightnvm: pblk: enable 1 LUN configuration

2017-09-15 Thread Javier González
Metadata I/Os are scheduled to minimize their impact on user data I/Os.
When there are enough LUNs instantiated (i.e., enough bandwidth), it is
easy to interleave metadata and data one after the other so that
metadata I/Os are the ones being blocked and not vice versa.

We do this by calculating the distance between the I/Os in terms of the
LUNs that are not in use, and selecting a free LUN that satisfies the
simple heuristic that metadata is scheduled behind. The per-LUN
semaphores guarantee consistency. This works fine for >1 LUN
configurations. However, when a single LUN is instantiated, this design
leads to a deadlock, where metadata waits to be scheduled on a free LUN.

This patch implements the 1 LUN case by simply scheduling the metadata
I/O after the data I/O. In the process, we refactor the way a line is
replaced to ensure that metadata writes are submitted after data writes
in order to guarantee block sequentiality. Note that, since there is
only one LUN, both I/Os will block each other by design. However, such
a configuration only pursues tight read latencies, not write bandwidth.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 17 ++---
 drivers/lightnvm/pblk-init.c |  8 ++--
 drivers/lightnvm/pblk-map.c  | 21 -
 drivers/lightnvm/pblk.h  |  2 +-
 4 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index d2587e37034f..64a6a255514e 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1370,17 +1370,17 @@ void pblk_pipeline_stop(struct pblk *pblk)
spin_unlock(_mg->free_lock);
 }
 
-void pblk_line_replace_data(struct pblk *pblk)
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 {
struct pblk_line_mgmt *l_mg = >l_mg;
-   struct pblk_line *cur, *new;
+   struct pblk_line *cur, *new = NULL;
unsigned int left_seblks;
int is_next = 0;
 
cur = l_mg->data_line;
new = l_mg->data_next;
if (!new)
-   return;
+   goto out;
l_mg->data_line = new;
 
spin_lock(_mg->free_lock);
@@ -1388,7 +1388,7 @@ void pblk_line_replace_data(struct pblk *pblk)
l_mg->data_line = NULL;
l_mg->data_next = NULL;
spin_unlock(_mg->free_lock);
-   return;
+   goto out;
}
 
pblk_line_setup_metadata(new, l_mg, >lm);
@@ -1400,7 +1400,7 @@ void pblk_line_replace_data(struct pblk *pblk)
/* If line is not fully erased, erase it */
if (atomic_read(>left_eblks)) {
if (pblk_line_erase(pblk, new))
-   return;
+   goto out;
} else {
io_schedule();
}
@@ -1411,7 +1411,7 @@ void pblk_line_replace_data(struct pblk *pblk)
if (!pblk_line_init_metadata(pblk, new, cur)) {
new = pblk_line_retry(pblk, new);
if (!new)
-   return;
+   goto out;
 
goto retry_setup;
}
@@ -1419,7 +1419,7 @@ void pblk_line_replace_data(struct pblk *pblk)
if (!pblk_line_init_bb(pblk, new, 1)) {
new = pblk_line_retry(pblk, new);
if (!new)
-   return;
+   goto out;
 
goto retry_setup;
}
@@ -1443,6 +1443,9 @@ void pblk_line_replace_data(struct pblk *pblk)
 
if (is_next)
pblk_rl_free_lines_dec(>rl, l_mg->data_next);
+
+out:
+   return new;
 }
 
 void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 670e5858611e..9d9adcf3e141 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -708,8 +708,12 @@ static int pblk_lines_init(struct pblk *pblk)
}
 
lm->emeta_bb = geo->nr_luns - i;
-   lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0],
-   geo->sec_per_blk);
+
+   lm->min_blk_line = 1;
+   if (geo->nr_luns > 1)
+   lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
+   lm->emeta_sec[0], geo->sec_per_blk);
+
if (lm->min_blk_line > lm->blk_per_line) {
pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
lm->blk_per_line);
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index fddb924f6dde..3bc4c94f9cf2 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -25,13 +25,23 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned 
int sentry,
   unsigned int valid_secs)
 {
struct pblk_line *line 

[PATCH] lightnvm: pblk: guarantee line integrity on reads

2017-09-15 Thread Javier González
When a line is recycled during garbage collection, reads can still be
issued to the line. If the line is freed in the middle of this process,
data corruption might occur.

This patch guarantees that lines are not freed in the middle of reads
that target them. Specifically, we use the existing line reference to
decide when a line is eligible for being freed after the recycle
process.
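
A minimal sketch of the completion-side counterpart to the kref_get()
taken in pblk_lookup_l2p_seq() in the diff below; the helper name and
its exact placement on the read completion path are illustrative:

/*
 * Sketch only: drop the per-line references taken at L2P lookup time
 * once the read has ended.
 */
static void pblk_read_put_line_refs(struct pblk *pblk, struct nvm_rq *rqd)
{
	struct ppa_addr *ppa_list;
	int i;

	ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;

	for (i = 0; i < rqd->nr_ppas; i++) {
		struct ppa_addr ppa = ppa_list[i];

		/* Only device-resident sectors hold a line reference */
		if (pblk_ppa_empty(ppa) || pblk_addr_in_cache(ppa))
			continue;

		kref_put(&pblk->lines[pblk_dev_ppa_to_line(ppa)].ref,
			 pblk_line_put_wq);
	}
}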

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 56 ++
 drivers/lightnvm/pblk-init.c | 14 +++--
 drivers/lightnvm/pblk-read.c | 71 +---
 drivers/lightnvm/pblk.h  |  2 ++
 4 files changed, 118 insertions(+), 25 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 5e6a881e3434..d2587e37034f 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1458,10 +1458,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line 
*line)
line->emeta = NULL;
 }
 
-void pblk_line_put(struct kref *ref)
+static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line)
 {
-   struct pblk_line *line = container_of(ref, struct pblk_line, ref);
-   struct pblk *pblk = line->pblk;
struct pblk_line_mgmt *l_mg = >l_mg;
 
spin_lock(>lock);
@@ -1479,6 +1477,43 @@ void pblk_line_put(struct kref *ref)
pblk_rl_free_lines_inc(>rl, line);
 }
 
+static void pblk_line_put_ws(struct work_struct *work)
+{
+   struct pblk_line_ws *line_put_ws = container_of(work,
+   struct pblk_line_ws, ws);
+   struct pblk *pblk = line_put_ws->pblk;
+   struct pblk_line *line = line_put_ws->line;
+
+   __pblk_line_put(pblk, line);
+   mempool_free(line_put_ws, pblk->gen_ws_pool);
+}
+
+void pblk_line_put(struct kref *ref)
+{
+   struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+   struct pblk *pblk = line->pblk;
+
+   __pblk_line_put(pblk, line);
+}
+
+void pblk_line_put_wq(struct kref *ref)
+{
+   struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+   struct pblk *pblk = line->pblk;
+   struct pblk_line_ws *line_put_ws;
+
+   line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC);
+   if (!line_put_ws)
+   return;
+
+   line_put_ws->pblk = pblk;
+   line_put_ws->line = line;
+   line_put_ws->priv = NULL;
+
+   INIT_WORK(_put_ws->ws, pblk_line_put_ws);
+   queue_work(pblk->r_end_wq, _put_ws->ws);
+}
+
 int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
 {
struct nvm_rq *rqd;
@@ -1886,8 +1921,19 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct 
ppa_addr *ppas,
int i;
 
spin_lock(>trans_lock);
-   for (i = 0; i < nr_secs; i++)
-   ppas[i] = pblk_trans_map_get(pblk, blba + i);
+   for (i = 0; i < nr_secs; i++) {
+   struct ppa_addr ppa;
+
+   ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i);
+
+   /* If the L2P entry maps to a line, the reference is valid */
+   if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
+   int line_id = pblk_dev_ppa_to_line(ppa);
+   struct pblk_line *line = >lines[line_id];
+
+   kref_get(>ref);
+   }
+   }
spin_unlock(>trans_lock);
 }
 
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index df86cf98ef4d..670e5858611e 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -271,15 +271,22 @@ static int pblk_core_init(struct pblk *pblk)
if (!pblk->bb_wq)
goto free_close_wq;
 
+   pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq",
+   WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+   if (!pblk->r_end_wq)
+   goto free_bb_wq;
+
if (pblk_set_ppaf(pblk))
-   goto free_bb_wq;
+   goto free_r_end_wq;
 
if (pblk_rwb_init(pblk))
-   goto free_bb_wq;
+   goto free_r_end_wq;
 
INIT_LIST_HEAD(>compl_list);
return 0;
 
+free_r_end_wq:
+   destroy_workqueue(pblk->r_end_wq);
 free_bb_wq:
destroy_workqueue(pblk->bb_wq);
 free_close_wq:
@@ -304,6 +311,9 @@ static void pblk_core_free(struct pblk *pblk)
if (pblk->close_wq)
destroy_workqueue(pblk->close_wq);
 
+   if (pblk->r_end_wq)
+   destroy_workqueue(pblk->r_end_wq);
+
if (pblk->bb_wq)
destroy_workqueue(pblk->bb_wq);
 
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index a465d9980df4..402f8eff6a2e 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -130,9 +130,34 @@ static void pblk_read_check(struct pblk *pblk, struct 
nvm_rq *rqd,
}
 }
 
-static void pblk_end_io_read(struct nvm_rq *rqd)
+static void 

[PATCH 01/11] lightnvm: pblk: use constant for GC max inflight

2017-09-15 Thread Javier González
Use a constant to set the maximum number of inflight GC requests
allowed.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-gc.c | 4 ++--
 drivers/lightnvm/pblk.h| 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index f163829ecca8..c21b2077432a 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -93,7 +93,7 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct 
pblk_gc_rq *gc_rq)
 
 retry:
spin_lock(>w_lock);
-   if (gc->w_entries >= PBLK_GC_W_QD) {
+   if (gc->w_entries >= PBLK_GC_RQ_QD) {
spin_unlock(>w_lock);
pblk_gc_writer_kick(>gc);
usleep_range(128, 256);
@@ -602,7 +602,7 @@ int pblk_gc_init(struct pblk *pblk)
spin_lock_init(>w_lock);
spin_lock_init(>r_lock);
 
-   sema_init(>gc_sem, 128);
+   sema_init(>gc_sem, PBLK_GC_RQ_QD);
 
INIT_LIST_HEAD(>w_list);
INIT_LIST_HEAD(>r_list);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 469976a20421..afd967b55d94 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -815,7 +815,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct 
pblk_c_ctx *c_ctx,
  * pblk gc
  */
 #define PBLK_GC_MAX_READERS 8  /* Max number of outstanding GC reader jobs */
-#define PBLK_GC_W_QD 128   /* Queue depth for inflight GC write I/Os */
+#define PBLK_GC_RQ_QD 128  /* Queue depth for inflight GC requests */
 #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
 #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
 
-- 
2.7.4



[PATCH 02/11] lightnvm: pblk: normalize ppa namings

2017-09-15 Thread Javier González
Normalize the way we name ppa variables to improve code readability.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 48 +++-
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 3004e9252a11..9e90aeedf926 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1754,7 +1754,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr 
*ppa_list, int nr_ppas,
 
 void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
 {
-   struct ppa_addr l2p_ppa;
+   struct ppa_addr ppa_l2p;
 
/* logic error: lba out-of-bounds. Ignore update */
if (!(lba < pblk->rl.nr_secs)) {
@@ -1763,10 +1763,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, 
struct ppa_addr ppa)
}
 
spin_lock(>trans_lock);
-   l2p_ppa = pblk_trans_map_get(pblk, lba);
+   ppa_l2p = pblk_trans_map_get(pblk, lba);
 
-   if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa))
-   pblk_map_invalidate(pblk, l2p_ppa);
+   if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p))
+   pblk_map_invalidate(pblk, ppa_l2p);
 
pblk_trans_map_set(pblk, lba, ppa);
spin_unlock(>trans_lock);
@@ -1783,16 +1783,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t 
lba, struct ppa_addr ppa)
pblk_update_map(pblk, lba, ppa);
 }
 
-int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr 
ppa_new,
   struct pblk_line *gc_line)
 {
-   struct ppa_addr l2p_ppa;
+   struct ppa_addr ppa_l2p;
int ret = 1;
 
 #ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a cache address */
-   BUG_ON(!pblk_addr_in_cache(ppa));
-   BUG_ON(pblk_rb_pos_oob(>rwb, pblk_addr_to_cacheline(ppa)));
+   BUG_ON(!pblk_addr_in_cache(ppa_new));
+   BUG_ON(pblk_rb_pos_oob(>rwb, pblk_addr_to_cacheline(ppa_new)));
 #endif
 
/* logic error: lba out-of-bounds. Ignore update */
@@ -1802,36 +1802,38 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, 
struct ppa_addr ppa,
}
 
spin_lock(>trans_lock);
-   l2p_ppa = pblk_trans_map_get(pblk, lba);
+   ppa_l2p = pblk_trans_map_get(pblk, lba);
 
/* Prevent updated entries to be overwritten by GC */
-   if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) ||
-   pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) {
+   if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) ||
+   pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) {
+
ret = 0;
goto out;
}
 
-   pblk_trans_map_set(pblk, lba, ppa);
+   pblk_trans_map_set(pblk, lba, ppa_new);
 out:
spin_unlock(>trans_lock);
return ret;
 }
 
-void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
-struct ppa_addr entry_line)
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache)
 {
-   struct ppa_addr l2p_line;
+   struct ppa_addr ppa_l2p;
 
 #ifdef CONFIG_NVM_DEBUG
/* Callers must ensure that the ppa points to a device address */
-   BUG_ON(pblk_addr_in_cache(ppa));
+   BUG_ON(pblk_addr_in_cache(ppa_mapped));
 #endif
/* Invalidate and discard padded entries */
if (lba == ADDR_EMPTY) {
 #ifdef CONFIG_NVM_DEBUG
atomic_long_inc(>padded_wb);
 #endif
-   pblk_map_invalidate(pblk, ppa);
+   if (!pblk_ppa_empty(ppa_mapped))
+   pblk_map_invalidate(pblk, ppa_mapped);
return;
}
 
@@ -1842,22 +1844,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t 
lba, struct ppa_addr ppa,
}
 
spin_lock(>trans_lock);
-   l2p_line = pblk_trans_map_get(pblk, lba);
+   ppa_l2p = pblk_trans_map_get(pblk, lba);
 
/* Do not update L2P if the cacheline has been updated. In this case,
 * the mapped ppa must be invalidated
 */
-   if (l2p_line.ppa != entry_line.ppa) {
-   if (!pblk_ppa_empty(ppa))
-   pblk_map_invalidate(pblk, ppa);
+   if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) {
+   if (!pblk_ppa_empty(ppa_mapped))
+   pblk_map_invalidate(pblk, ppa_mapped);
goto out;
}
 
 #ifdef CONFIG_NVM_DEBUG
-   WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line));
+   WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p));
 #endif
 
-   pblk_trans_map_set(pblk, lba, ppa);
+   pblk_trans_map_set(pblk, lba, ppa_mapped);
 out:
spin_unlock(>trans_lock);
 }
-- 

[PATCH 03/11] lightnvm: pblk: refactor read lba sanity check

2017-09-15 Thread Javier González
Refactor lba sanity check on read path to avoid code duplication.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-read.c | 29 ++---
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d2b6e2a7d7d5..eaaf9d55ba97 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -39,21 +39,14 @@ static int pblk_read_from_cache(struct pblk *pblk, struct 
bio *bio,
 }
 
 static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
-unsigned long *read_bitmap)
+sector_t blba, unsigned long *read_bitmap)
 {
struct bio *bio = rqd->bio;
struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
-   sector_t blba = pblk_get_lba(bio);
int nr_secs = rqd->nr_ppas;
bool advanced_bio = false;
int i, j = 0;
 
-   /* logic error: lba out-of-bounds. Ignore read request */
-   if (blba + nr_secs >= pblk->rl.nr_secs) {
-   WARN(1, "pblk: read lbas out of bounds\n");
-   return;
-   }
-
pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
 
for (i = 0; i < nr_secs; i++) {
@@ -259,17 +252,10 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
 }
 
 static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
-unsigned long *read_bitmap)
+sector_t lba, unsigned long *read_bitmap)
 {
struct bio *bio = rqd->bio;
struct ppa_addr ppa;
-   sector_t lba = pblk_get_lba(bio);
-
-   /* logic error: lba out-of-bounds. Ignore read request */
-   if (lba >= pblk->rl.nr_secs) {
-   WARN(1, "pblk: read lba out of bounds\n");
-   return;
-   }
 
pblk_lookup_l2p_seq(pblk, , lba, 1);
 
@@ -305,14 +291,19 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq 
*rqd,
 int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 {
struct nvm_tgt_dev *dev = pblk->dev;
+   sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
struct nvm_rq *rqd;
unsigned long read_bitmap; /* Max 64 ppas per request */
unsigned int bio_init_idx;
int ret = NVM_IO_ERR;
 
-   if (nr_secs > PBLK_MAX_REQ_ADDRS)
+   /* logic error: lba out-of-bounds. Ignore read request */
+   if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) {
+   WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n",
+   (unsigned long long)blba, nr_secs);
return NVM_IO_ERR;
+   }
 
bitmap_zero(_bitmap, nr_secs);
 
@@ -340,9 +331,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
 
-   pblk_read_ppalist_rq(pblk, rqd, _bitmap);
+   pblk_read_ppalist_rq(pblk, rqd, blba, _bitmap);
} else {
-   pblk_read_rq(pblk, rqd, _bitmap);
+   pblk_read_rq(pblk, rqd, blba, _bitmap);
}
 
bio_get(bio);
-- 
2.7.4



[PATCH 04/11] lightnvm: pblk: simplify data validity check on GC

2017-09-15 Thread Javier González
When a line is selected for recycling by the garbage collector (GC), the
line state changes and the invalid bitmap is frozen, preventing
invalidations from happening. Throughout the GC, the L2P map is checked
to verify that no data being recycled has been updated. The last check
is done before the new mapping is stored in the L2P table. Though this
algorithm works, it requires a number of corner cases to be checked each
time the L2P table is updated. This complicates readability and is error
prone in case the recycling algorithm is modified.

Instead, this patch makes the invalid bitmap accessible even when the
line is being recycled. When recycled data is being remapped, it is
enough to check the invalid bitmap for the line before updating the L2P
table.
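
A minimal sketch of the remap-time check described above, assuming the
invalid bitmap stays accessible while the line is in GC; parameter and
field names mirror existing pblk code, but this is not the exact final
implementation:

/*
 * Sketch only: before remapping a GC'd sector, make sure it has not
 * been invalidated by a newer write while the line was being recycled.
 */
static int pblk_gc_remap_sketch(struct pblk *pblk, sector_t lba,
				struct ppa_addr ppa_new,
				struct pblk_line *gc_line, u64 paddr_gc)
{
	int ret = 1;

	spin_lock(&pblk->trans_lock);

	/* The sector was overwritten while the line was being recycled */
	if (test_bit(paddr_gc, gc_line->invalid_bitmap)) {
		ret = 0;
		goto out;
	}

	pblk_trans_map_set(pblk, lba, ppa_new);
out:
	spin_unlock(&pblk->trans_lock);
	return ret;
}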

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-cache.c | 20 +--
 drivers/lightnvm/pblk-core.c  | 29 +++-
 drivers/lightnvm/pblk-gc.c| 60 ++--
 drivers/lightnvm/pblk-rb.c|  6 ++--
 drivers/lightnvm/pblk-read.c  | 79 +++
 drivers/lightnvm/pblk.h   | 23 -
 6 files changed, 110 insertions(+), 107 deletions(-)

diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 024a8fc93069..1d6b8e3585f1 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -73,12 +73,11 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, 
unsigned long flags)
  * On GC the incoming lbas are not necessarily sequential. Also, some of the
  * lbas might not be valid entries, which are marked as empty by the GC thread
  */
-int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
-  unsigned int nr_entries, unsigned int nr_rec_entries,
-  struct pblk_line *gc_line, unsigned long flags)
+int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
 {
struct pblk_w_ctx w_ctx;
unsigned int bpos, pos;
+   void *data = gc_rq->data;
int i, valid_entries;
 
/* Update the write buffer head (mem) with the entries that we can
@@ -86,28 +85,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, 
u64 *lba_list,
 * rollback from here on.
 */
 retry:
-   if (!pblk_rb_may_write_gc(>rwb, nr_rec_entries, )) {
+   if (!pblk_rb_may_write_gc(>rwb, gc_rq->secs_to_gc, )) {
io_schedule();
goto retry;
}
 
-   w_ctx.flags = flags;
+   w_ctx.flags = PBLK_IOTYPE_GC;
pblk_ppa_set_empty(_ctx.ppa);
 
-   for (i = 0, valid_entries = 0; i < nr_entries; i++) {
-   if (lba_list[i] == ADDR_EMPTY)
+   for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) {
+   if (gc_rq->lba_list[i] == ADDR_EMPTY)
continue;
 
-   w_ctx.lba = lba_list[i];
+   w_ctx.lba = gc_rq->lba_list[i];
 
pos = pblk_rb_wrap_pos(>rwb, bpos + valid_entries);
-   pblk_rb_write_entry_gc(>rwb, data, w_ctx, gc_line, pos);
+   pblk_rb_write_entry_gc(>rwb, data, w_ctx, gc_rq->line,
+   gc_rq->paddr_list[i], pos);
 
data += PBLK_EXPOSED_PAGE_SIZE;
valid_entries++;
}
 
-   WARN_ONCE(nr_rec_entries != valid_entries,
+   WARN_ONCE(gc_rq->secs_to_gc != valid_entries,
"pblk: inconsistent GC write\n");
 
 #ifdef CONFIG_NVM_DEBUG
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 9e90aeedf926..2c9b3a5bf3c2 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -78,11 +78,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct 
pblk_line *line,
 * that newer updates are not overwritten.
 */
spin_lock(>lock);
-   if (line->state == PBLK_LINESTATE_GC ||
-   line->state == PBLK_LINESTATE_FREE) {
-   spin_unlock(>lock);
-   return;
-   }
+   WARN_ON(line->state == PBLK_LINESTATE_FREE);
 
if (test_and_set_bit(paddr, line->invalid_bitmap)) {
WARN_ONCE(1, "pblk: double invalidate\n");
@@ -99,8 +95,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct 
pblk_line *line,
spin_lock(_mg->gc_lock);
spin_lock(>lock);
/* Prevent moving a line that has just been chosen for GC */
-   if (line->state == PBLK_LINESTATE_GC ||
-   line->state == PBLK_LINESTATE_FREE) {
+   if (line->state == PBLK_LINESTATE_GC) {
spin_unlock(>lock);
spin_unlock(_mg->gc_lock);
return;
@@ -1774,6 +1769,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, 
struct ppa_addr ppa)
 
 void 

[PATCH 06/11] lightnvm: pblk: put bio on bio completion

2017-09-15 Thread Javier González
Simplify the bio put by doing it in the bio end_io callback instead of
manually putting it on the completion path.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 10 +++---
 drivers/lightnvm/pblk-read.c |  1 -
 drivers/lightnvm/pblk-recovery.c |  1 -
 drivers/lightnvm/pblk-write.c|  8 +---
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 2c9b3a5bf3c2..55d472beee24 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -417,6 +417,11 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
return nvm_submit_io(dev, rqd);
 }
 
+static void pblk_bio_map_addr_endio(struct bio *bio)
+{
+   bio_put(bio);
+}
+
 struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
  unsigned int nr_secs, unsigned int len,
  int alloc_type, gfp_t gfp_mask)
@@ -453,6 +458,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
 
kaddr += PAGE_SIZE;
}
+
+   bio->bi_end_io = pblk_bio_map_addr_endio;
 out:
return bio;
 }
@@ -669,9 +676,6 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
atomic_dec(>inflight_io);
reinit_completion();
 
-   if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META))
-   bio_put(bio);
-
if (rqd.error) {
if (dir == WRITE)
pblk_log_write_err(pblk, );
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index c28d6509312e..e7141b1aaded 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -531,7 +531,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct 
pblk_gc_rq *gc_rq)
atomic_long_sub(gc_rq->secs_to_gc, >inflight_reads);
 #endif
 
-   bio_put(bio);
 out:
nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
return ret;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 573085bdb7e5..b3950ab862f9 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -333,7 +333,6 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
 
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
 
-   bio_put(rqd->bio);
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, WRITE);
 
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 939eb6df2265..4cea2f76d4e2 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -191,17 +191,12 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd)
pblk_log_write_err(pblk, rqd);
pr_err("pblk: metadata I/O failed. Line %d\n", line->id);
}
-#ifdef CONFIG_NVM_DEBUG
-   else
-   WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
-#endif
 
sync = atomic_add_return(rqd->nr_ppas, >sync);
if (sync == emeta->nr_entries)
pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws,
GFP_ATOMIC, pblk->close_wq);
 
-   bio_put(rqd->bio);
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, READ);
 
@@ -430,8 +425,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line 
*meta_line)
 
nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
 fail_free_bio:
-   if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META))
-   bio_put(bio);
+   bio_put(bio);
 fail_free_rqd:
pblk_free_rqd(pblk, rqd, READ);
return ret;
-- 
2.7.4



[PATCH 05/11] lightnvm: pblk: refactor read path on GC

2017-09-15 Thread Javier González
Simplify the part of the garbage collector where data is read from the
line being recycled and moved into an internal queue before being copied
to the memory buffer. This allows us to get rid of a dedicated function
that introduced an unnecessary dependency in the code.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-gc.c | 94 +++---
 1 file changed, 39 insertions(+), 55 deletions(-)

diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 7ad0cfe58a21..7b103bce58bf 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -56,57 +56,6 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc)
wake_up_process(gc->gc_writer_ts);
 }
 
-/*
- * Responsible for managing all memory related to a gc request. Also in case of
- * failure
- */
-static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq)
-{
-   struct nvm_tgt_dev *dev = pblk->dev;
-   struct nvm_geo *geo = >geo;
-   struct pblk_gc *gc = >gc;
-   struct pblk_line *line = gc_rq->line;
-   void *data;
-   int ret = 0;
-
-   data = vmalloc(gc_rq->nr_secs * geo->sec_size);
-   if (!data) {
-   ret = -ENOMEM;
-   goto fail;
-   }
-
-   gc_rq->data = data;
-
-   /* Read from GC victim block */
-   ret = pblk_submit_read_gc(pblk, gc_rq);
-   if (ret)
-   goto fail;
-
-   if (!gc_rq->secs_to_gc)
-   goto fail;
-
-retry:
-   spin_lock(>w_lock);
-   if (gc->w_entries >= PBLK_GC_RQ_QD) {
-   spin_unlock(>w_lock);
-   pblk_gc_writer_kick(>gc);
-   usleep_range(128, 256);
-   goto retry;
-   }
-   gc->w_entries++;
-   list_add_tail(_rq->list, >w_list);
-   spin_unlock(>w_lock);
-
-   pblk_gc_writer_kick(>gc);
-
-   return 0;
-
-fail:
-   pblk_gc_free_gc_rq(gc_rq);
-   kref_put(>ref, pblk_line_put);
-   return ret;
-}
-
 static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
 {
struct pblk_line_mgmt *l_mg = >l_mg;
@@ -130,18 +79,53 @@ static void pblk_gc_line_ws(struct work_struct *work)
struct pblk_line_ws *gc_rq_ws = container_of(work,
struct pblk_line_ws, ws);
struct pblk *pblk = gc_rq_ws->pblk;
+   struct nvm_tgt_dev *dev = pblk->dev;
+   struct nvm_geo *geo = >geo;
struct pblk_gc *gc = >gc;
struct pblk_line *line = gc_rq_ws->line;
struct pblk_gc_rq *gc_rq = gc_rq_ws->priv;
+   int ret;
 
up(>gc_sem);
 
-   if (pblk_gc_move_valid_secs(pblk, gc_rq)) {
-   pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n",
-   line->id, *line->vsc,
-   gc_rq->nr_secs);
+   gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size);
+   if (!gc_rq->data) {
+   pr_err("pblk: could not GC line:%d (%d/%d)\n",
+   line->id, *line->vsc, gc_rq->nr_secs);
+   goto out;
}
 
+   /* Read from GC victim block */
+   ret = pblk_submit_read_gc(pblk, gc_rq);
+   if (ret) {
+   pr_err("pblk: failed GC read in line:%d (err:%d)\n",
+   line->id, ret);
+   goto out;
+   }
+
+   if (!gc_rq->secs_to_gc)
+   goto out;
+
+retry:
+   spin_lock(>w_lock);
+   if (gc->w_entries >= PBLK_GC_RQ_QD) {
+   spin_unlock(>w_lock);
+   pblk_gc_writer_kick(>gc);
+   usleep_range(128, 256);
+   goto retry;
+   }
+   gc->w_entries++;
+   list_add_tail(_rq->list, >w_list);
+   spin_unlock(>w_lock);
+
+   pblk_gc_writer_kick(>gc);
+
+   kfree(gc_rq_ws);
+   return;
+
+out:
+   pblk_gc_free_gc_rq(gc_rq);
+   kref_put(>ref, pblk_line_put);
kfree(gc_rq_ws);
 }
 
-- 
2.7.4



[PATCH 07/11] lightnvm: pblk: simplify path on REQ_PREFLUSH

2017-09-15 Thread Javier González
On REQ_PREFLUSH, directly tag the I/O context flags to signal a flush in
the write-to-cache path, instead of finding the correct entry context
and imposing a memory barrier. This simplifies the code and might
prevent race conditions when adding functionality to the write path.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-cache.c | 4 +++-
 drivers/lightnvm/pblk-rb.c| 8 +---
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 1d6b8e3585f1..0d227ef7d1b9 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -43,8 +43,10 @@ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, 
unsigned long flags)
if (unlikely(!bio_has_data(bio)))
goto out;
 
+   pblk_ppa_set_empty(_ctx.ppa);
w_ctx.flags = flags;
-   pblk_ppa_set_empty(_ctx.ppa);
+   if (bio->bi_opf & REQ_PREFLUSH)
+   w_ctx.flags |= PBLK_FLUSH_ENTRY;
 
for (i = 0; i < nr_entries; i++) {
void *data = bio_data(bio);
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 74c768ce09ef..05e6b2e9221d 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -355,7 +355,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, 
struct bio *bio,
 {
struct pblk_rb_entry *entry;
unsigned int subm, sync_point;
-   int flags;
 
subm = READ_ONCE(rb->subm);
 
@@ -369,12 +368,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, 
struct bio *bio,
sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
entry = >entries[sync_point];
 
-   flags = READ_ONCE(entry->w_ctx.flags);
-   flags |= PBLK_FLUSH_ENTRY;
-
-   /* Release flags on context. Protect from writes */
-   smp_store_release(>w_ctx.flags, flags);
-
/* Protect syncs */
smp_store_release(>sync_point, sync_point);
 
@@ -454,6 +447,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, 
unsigned int nr_entries,
 
/* Protect from read count */
smp_store_release(>mem, mem);
+
return 1;
 }
 
-- 
2.7.4



[PATCH 09/11] lightnvm: pblk: improve naming for internal req.

2017-09-15 Thread Javier González
Each request type sent to the LightNVM subsystem requires different
metadata. Until now, we have tailored this metadata based on write, read
and erase commands. However, pblk uses different metadata for internal
writes that do not hit the write buffer. Instead of abusing the metadata
for reads, create a new request type (internal write) to improve code
readability.

In the process, create internal values for each I/O type instead of
abusing the READ/WRITE macros, as suggested by Christoph.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 32 
 drivers/lightnvm/pblk-read.c |  6 +++---
 drivers/lightnvm/pblk-recovery.c | 12 ++--
 drivers/lightnvm/pblk-write.c| 16 
 drivers/lightnvm/pblk.h  | 11 ---
 5 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 55d472beee24..1ac54b2bba26 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -152,7 +152,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
struct nvm_rq *rqd;
int rq_size;
 
-   if (rw == WRITE) {
+   if (rw == PBLK_WRITE) {
pool = pblk->w_rq_pool;
rq_size = pblk_w_rq_size;
} else {
@@ -170,7 +170,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, 
int rw)
 {
mempool_t *pool;
 
-   if (rw == WRITE)
+   if (rw == PBLK_WRITE)
pool = pblk->w_rq_pool;
else
pool = pblk->r_rq_pool;
@@ -567,10 +567,10 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
int ret;
DECLARE_COMPLETION_ONSTACK(wait);
 
-   if (dir == WRITE) {
+   if (dir == PBLK_WRITE) {
bio_op = REQ_OP_WRITE;
cmd_op = NVM_OP_PWRITE;
-   } else if (dir == READ) {
+   } else if (dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
} else
@@ -610,10 +610,10 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
rqd.end_io = pblk_end_io_sync;
rqd.private = 
 
-   if (dir == WRITE) {
+   if (dir == PBLK_WRITE) {
struct pblk_sec_meta *meta_list = rqd.meta_list;
 
-   rqd.flags = pblk_set_progr_mode(pblk, WRITE);
+   rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
for (i = 0; i < rqd.nr_ppas; ) {
spin_lock(>lock);
paddr = __pblk_alloc_page(pblk, line, min);
@@ -677,7 +677,7 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, 
struct pblk_line *line,
reinit_completion();
 
if (rqd.error) {
-   if (dir == WRITE)
+   if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, );
else
pblk_log_read_err(pblk, );
@@ -720,12 +720,12 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, 
struct pblk_line *line,
int flags;
DECLARE_COMPLETION_ONSTACK(wait);
 
-   if (dir == WRITE) {
+   if (dir == PBLK_WRITE) {
bio_op = REQ_OP_WRITE;
cmd_op = NVM_OP_PWRITE;
-   flags = pblk_set_progr_mode(pblk, WRITE);
+   flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
-   } else if (dir == READ) {
+   } else if (dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -763,7 +763,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, 
struct pblk_line *line,
 
rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
 
-   if (dir == WRITE) {
+   if (dir == PBLK_WRITE) {
__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
 
meta_list[i].lba = lba_list[paddr] = addr_empty;
@@ -789,7 +789,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, 
struct pblk_line *line,
atomic_dec(>inflight_io);
 
if (rqd.error) {
-   if (dir == WRITE)
+   if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, );
else
pblk_log_read_err(pblk, );
@@ -805,14 +805,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct 
pblk_line *line)
 {
u64 bpaddr = pblk_line_smeta_start(pblk, line);
 
-   return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ);
+   return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
 }
 
 int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
 void *emeta_buf)
 {
return pblk_line_submit_emeta_io(pblk, line, emeta_buf,
- 

[PATCH 10/11] lightnvm: pblk: refactor rqd alloc/free

2017-09-15 Thread Javier González
Refactor the rqd allocation and free functions so that all I/O types can
use these helper functions.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 40 ++--
 drivers/lightnvm/pblk-read.c |  2 --
 drivers/lightnvm/pblk-recovery.c |  2 --
 drivers/lightnvm/pblk-write.c|  7 ---
 drivers/lightnvm/pblk.h  |  4 ++--
 5 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 1ac54b2bba26..76c3b20dc3d5 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -146,18 +146,26 @@ static void pblk_invalidate_range(struct pblk *pblk, 
sector_t slba,
spin_unlock(>trans_lock);
 }
 
-struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+/* Caller must guarantee that the request is a valid type */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type)
 {
mempool_t *pool;
struct nvm_rq *rqd;
int rq_size;
 
-   if (rw == PBLK_WRITE) {
+   switch (type) {
+   case PBLK_WRITE:
+   case PBLK_WRITE_INT:
pool = pblk->w_rq_pool;
rq_size = pblk_w_rq_size;
-   } else {
+   break;
+   case PBLK_READ:
pool = pblk->r_rq_pool;
rq_size = pblk_g_rq_size;
+   break;
+   default:
+   pool = pblk->e_rq_pool;
+   rq_size = pblk_g_rq_size;
}
 
rqd = mempool_alloc(pool, GFP_KERNEL);
@@ -166,15 +174,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
return rqd;
 }
 
-void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+/* Typically used on completion path. Cannot guarantee request consistency */
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type)
 {
+   struct nvm_tgt_dev *dev = pblk->dev;
mempool_t *pool;
 
-   if (rw == PBLK_WRITE)
+   switch (type) {
+   case PBLK_WRITE:
+   kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap);
+   case PBLK_WRITE_INT:
pool = pblk->w_rq_pool;
-   else
+   break;
+   case PBLK_READ:
pool = pblk->r_rq_pool;
+   break;
+   case PBLK_ERASE:
+   pool = pblk->e_rq_pool;
+   break;
+   default:
+   pr_err("pblk: trying to free unknown rqd type\n");
+   return;
+   }
 
+   nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
mempool_free(rqd, pool);
 }
 
@@ -1468,8 +1491,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct 
ppa_addr ppa)
struct nvm_rq *rqd;
int err;
 
-   rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL);
-   memset(rqd, 0, pblk_g_rq_size);
+   rqd = pblk_alloc_rqd(pblk, PBLK_ERASE);
 
pblk_setup_e_rq(pblk, rqd, ppa);
 
@@ -1747,8 +1769,6 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr 
*ppa_list, int nr_ppas,
rlun = >luns[bit];
up(>wr_sem);
}
-
-   kfree(lun_bitmap);
 }
 
 void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 4b1722fbe5a0..d7c90c303540 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -124,8 +124,6 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
 #endif
 
-   nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
-
bio_put(bio);
if (r_ctx->private) {
struct bio *orig_bio = r_ctx->private;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 848950fbe71c..1869eef1a109 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -329,11 +329,9 @@ static void pblk_end_io_recov(struct nvm_rq *rqd)
 {
struct pblk_pad_rq *pad_rq = rqd->private;
struct pblk *pblk = pad_rq->pblk;
-   struct nvm_tgt_dev *dev = pblk->dev;
 
pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas);
 
-   nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list);
pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT);
 
atomic_dec(>inflight_io);
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 7352b7d55a4d..cec526759547 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -20,7 +20,6 @@
 static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
 {
-   struct nvm_tgt_dev *dev = pblk->dev;
struct bio *original_bio;
unsigned long ret;
int i;
@@ -46,8 +45,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct 
nvm_rq *rqd,
 
ret = pblk_rb_sync_advance(>rwb, 

[PATCH 08/11] lightnvm: pblk: allocate bio size more accurately

2017-09-15 Thread Javier González
Wait until we know the exact number of ppas to be sent to the device
before allocating the bio.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-rb.c|  5 +++--
 drivers/lightnvm/pblk-write.c | 20 ++--
 drivers/lightnvm/pblk.h   |  4 ++--
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 05e6b2e9221d..1173e2380137 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -552,12 +552,13 @@ unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, 
struct bio *bio,
  * persist data on the write buffer to the media.
  */
 unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
-struct bio *bio, unsigned int pos,
-unsigned int nr_entries, unsigned int count)
+unsigned int pos, unsigned int nr_entries,
+unsigned int count)
 {
struct pblk *pblk = container_of(rb, struct pblk, rwb);
struct request_queue *q = pblk->dev->q;
struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+   struct bio *bio = rqd->bio;
struct pblk_rb_entry *entry;
struct page *page;
unsigned int pad = 0, to_read = nr_entries;
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 4cea2f76d4e2..d440dbbe018d 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -537,24 +537,24 @@ static int pblk_submit_write(struct pblk *pblk)
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;
 
-   bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
-
-   bio->bi_iter.bi_sector = 0; /* internal bio */
-   bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-
-   rqd = pblk_alloc_rqd(pblk, WRITE);
-   rqd->bio = bio;
-
secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
if (secs_to_sync > pblk->max_write_pgs) {
pr_err("pblk: bad buffer sync calculation\n");
-   goto fail_put_bio;
+   return 1;
}
 
secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
pos = pblk_rb_read_commit(>rwb, secs_to_com);
 
-   if (pblk_rb_read_to_bio(>rwb, rqd, bio, pos, secs_to_sync,
+   bio = bio_alloc(GFP_KERNEL, secs_to_sync);
+
+   bio->bi_iter.bi_sector = 0; /* internal bio */
+   bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+   rqd = pblk_alloc_rqd(pblk, WRITE);
+   rqd->bio = bio;
+
+   if (pblk_rb_read_to_bio(>rwb, rqd, pos, secs_to_sync,
secs_avail)) {
pr_err("pblk: corrupted write bio\n");
goto fail_put_bio;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 2b6ec6b7a794..74fa189a94e1 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -665,8 +665,8 @@ void pblk_rb_flush(struct pblk_rb *rb);
 
 void pblk_rb_sync_l2p(struct pblk_rb *rb);
 unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
-struct bio *bio, unsigned int pos,
-unsigned int nr_entries, unsigned int count);
+unsigned int pos, unsigned int nr_entries,
+unsigned int count);
 unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
  struct list_head *list,
  unsigned int max);
-- 
2.7.4



[PATCH 11/11] lightnvm: pblk: use rqd->end_io for completion

2017-09-15 Thread Javier González
For consistency with the rest of pblk, use rqd->end_io to point to the
function taking care of ending the request on the completion path.

Signed-off-by: Javier González 
---
 drivers/lightnvm/pblk-core.c | 7 ---
 drivers/lightnvm/pblk-read.c | 5 ++---
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 76c3b20dc3d5..5e6a881e3434 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -261,13 +261,6 @@ void pblk_write_should_kick(struct pblk *pblk)
pblk_write_kick(pblk);
 }
 
-void pblk_end_bio_sync(struct bio *bio)
-{
-   struct completion *waiting = bio->bi_private;
-
-   complete(waiting);
-}
-
 void pblk_end_io_sync(struct nvm_rq *rqd)
 {
struct completion *waiting = rqd->private;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index d7c90c303540..0299fc08291d 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -170,13 +170,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, 
struct nvm_rq *rqd,
 
new_bio->bi_iter.bi_sector = 0; /* internal bio */
bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
-   new_bio->bi_private = &wait;
-   new_bio->bi_end_io = pblk_end_bio_sync;
 
rqd->bio = new_bio;
rqd->nr_ppas = nr_holes;
rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM);
-   rqd->end_io = NULL;
+   rqd->end_io = pblk_end_io_sync;
+   rqd->private = &wait;
 
if (unlikely(nr_secs > 1 && nr_holes == 1)) {
ppa_ptr = rqd->ppa_list;
-- 
2.7.4



[PATCH 00/11] lightnvm: pblk: cleanup

2017-09-15 Thread Javier González
This patchset is a general cleanup to improve code readability.

Javier González (11):
  lightnvm: pblk: use constant for GC max inflight
  lightnvm: pblk: normalize ppa namings
  lightnvm: pblk: refactor read lba sanity check
  lightnvm: pblk: simplify data validity check on GC
  lightnvm: pblk: refactor read path on GC
  lightnvm: pblk: put bio on bio completion
  lightnvm: pblk: simplify path on REQ_PREFLUSH
  lightnvm: pblk: allocate bio size more accurately
  lightnvm: pblk: improve naming for internal req.
  lightnvm: pblk: refactor rqd alloc/free
  lightnvm: pblk: use rqd->end_io for completion

 drivers/lightnvm/pblk-cache.c|  24 +++---
 drivers/lightnvm/pblk-core.c | 156 +--
 drivers/lightnvm/pblk-gc.c   | 134 -
 drivers/lightnvm/pblk-rb.c   |  19 ++---
 drivers/lightnvm/pblk-read.c | 122 +++---
 drivers/lightnvm/pblk-recovery.c |  15 ++--
 drivers/lightnvm/pblk-write.c|  49 +---
 drivers/lightnvm/pblk.h  |  44 +--
 8 files changed, 271 insertions(+), 292 deletions(-)

-- 
2.7.4



Re: [PATCH V3 12/12] block: Introduce zoned I/O scheduler

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> The zoned I/O scheduler is mostly identical to mq-deadline and retains
> the same configuration attributes. The main difference is that the
> zoned scheduler will ensure that at any time at most one write request
> per sequential zone is in flight (has been dispatched to the disk) in
> order to protect against potential sequential write reordering
> resulting from the concurrent execution of request dispatch by multiple
> contexts.
> 
> This is achieved similarly to the legacy SCSI I/O path by write locking
> zones when a write request is issued. Subsequent writes to the same
> zone have to wait for the completion of the previously issued write
> before being in turn dispatched to the disk. This ensures that
> sequential writes are processed in the correct order without needing
> any modification to the execution model of blk-mq. In addition, this
> also protects against write reordering at the HBA level (e.g. AHCI).
> 
> Signed-off-by: Damien Le Moal 
> ---
>  Documentation/block/zoned-iosched.txt |  48 ++
>  block/Kconfig.iosched |  12 +
>  block/Makefile|   1 +
>  block/zoned-iosched.c | 925 
> ++
>  4 files changed, 986 insertions(+)
>  create mode 100644 Documentation/block/zoned-iosched.txt
>  create mode 100644 block/zoned-iosched.c
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 11/12] scsi: sd_zbc: Disable zone write locking with scsi-mq

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> In the case of a ZBC disk used with scsi-mq, zone write locking does
> not prevent write reordering in sequential zones. Unlike the legacy
> case, zone locking is done after the command request is removed from
> the scheduler dispatch queue. That is, at the time of zone locking,
> the write command may already be out of order, making locking
> ineffective. Write order guarantees can only be provided by an
> adapted I/O scheduler.
> 
> Disable zone write locking in sd_zbc_write_lock_zone() if the disk is
> used with scsi-mq. As the disk zones_wlock bitmap is not necessary,
> do not allocate it.
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Bart Van Assche 
> Reviewed-by: Johannes Thumshirn 
> ---
>  drivers/scsi/sd_zbc.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 10/12] scsi: sd_zbc: Limit zone write locking to sequential zones

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> Zoned block devices have no write constraints for conventional zones.
> So write locking of conventional zones is not necessary and can even
> hurt performance by unnecessarily operating the disk under low queue
> depth. To avoid this, use the disk request queue seq_zones bitmap to
> allow any write to be issued to conventional zones, locking only
> sequential zones.
> 
> Signed-off-by: Damien Le Moal 
> ---
>  drivers/scsi/sd_zbc.c | 20 
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 09/12] scsi: sd_zbc: Initialize device queue zoned structure

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> Allocate and initialize the disk request queue zoned structure on disk
> revalidate. As the bitmap allocation for the seq_zones field of the
> zoned structure is identical to the allocation of the zones write lock
> bitmap, introduce the helper sd_zbc_alloc_zone_bitmap().
> Using this helper, wait for the disk capacity and number of zones to
> stabilize on the second revalidation pass to allocate and initialize
> the bitmaps.
> 
> Signed-off-by: Damien Le Moal 
> ---
>  drivers/scsi/sd_zbc.c | 119 
> +++---
>  1 file changed, 114 insertions(+), 5 deletions(-)
> 
I would've been slightly worried about locking; these things tend to be
reset during revalidate, which happens quite frequently for ATA drives.
You sure you don't need it here?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 08/12] scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> The three values starting at byte 8 of the Zoned Block Device
> Characteristics VPD page B6h are 32-bit values, not 64-bit. So use
> get_unaligned_be32() to retrieve the values and not get_unaligned_be64().
> 
> Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
> Cc: 
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Bart Van Assche 
> Reviewed-by: Johannes Thumshirn 
> ---
>  drivers/scsi/sd_zbc.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
> index 37b33adbb2ef..7e30d0c22930 100644
> --- a/drivers/scsi/sd_zbc.c
> +++ b/drivers/scsi/sd_zbc.c
> @@ -385,15 +385,15 @@ static int sd_zbc_read_zoned_characteristics(struct 
> scsi_disk *sdkp,
>   if (sdkp->device->type != TYPE_ZBC) {
>   /* Host-aware */
>   sdkp->urswrz = 1;
> - sdkp->zones_optimal_open = get_unaligned_be64(&buf[8]);
> - sdkp->zones_optimal_nonseq = get_unaligned_be64(&buf[12]);
> + sdkp->zones_optimal_open = get_unaligned_be32(&buf[8]);
> + sdkp->zones_optimal_nonseq = get_unaligned_be32(&buf[12]);
>   sdkp->zones_max_open = 0;
>   } else {
>   /* Host-managed */
>   sdkp->urswrz = buf[4] & 1;
>   sdkp->zones_optimal_open = 0;
>   sdkp->zones_optimal_nonseq = 0;
> - sdkp->zones_max_open = get_unaligned_be64(&buf[16]);
> + sdkp->zones_max_open = get_unaligned_be32(&buf[16]);
>   }
>  
>   return 0;
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 07/12] scsi: sd_zbc: Use well defined macros

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> Instead of open coding, use the min() macro to calculate the report zones
> reply buffer length in sd_zbc_check_zone_size() and the round_up()
> macro for calculating the number of zones in sd_zbc_setup().
> 
> No functional change is introduced by this patch.
> 
> Signed-off-by: Damien Le Moal 
> Reviewed-by: Johannes Thumshirn 
> ---
>  drivers/scsi/sd_zbc.c | 12 
>  1 file changed, 4 insertions(+), 8 deletions(-)
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
> assignments and move the calculation of sdkp->zone_shift together
> with the assignment of the verified zone_blocks value in
> sd_zbc_check_zone_size().
> 
> No functional change is introduced by this patch.
> 
> Signed-off-by: Damien Le Moal 
> ---
>  drivers/scsi/sd_zbc.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
Reviewed-by: Hannes Reinecke 

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [PATCH V3 05/12] scsi: sd_zbc: Fix comments and indentation

2017-09-15 Thread Hannes Reinecke
On 09/15/2017 12:06 PM, Damien Le Moal wrote:
> Fix comment style (do not use the kernel-doc documented comment style) and
> add some comments to clarify some functions. Also fix the indentation of some
> function signatures and remove a useless blank line in sd_zbc_read_zones().
> 
> No functional change is introduced by this patch.
> 
> Signed-off-by: Damien Le Moal 
> ---
>  drivers/scsi/scsi_lib.c |  5 +++-
>  drivers/scsi/sd_zbc.c   | 67 
> -
>  2 files changed, 53 insertions(+), 19 deletions(-)
> 
Actually you should've used kernel-doc style to format the comments,
just to keep the 0-day bot happy.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   Teamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


[PATCH V3 12/12] block: Introduce zoned I/O scheduler

2017-09-15 Thread Damien Le Moal
The zoned I/O scheduler is mostly identical to mq-deadline and retains
the same configuration attributes. The main difference is that the
zoned scheduler will ensure that at any time at most one write request
per sequential zone is in flight (has been dispatched to the disk) in
order to protect against potential sequential write reordering
resulting from the concurrent execution of request dispatch by multiple
contexts.

This is achieved similarly to the legacy SCSI I/O path by write locking
zones when a write request is issued. Subsequent writes to the same
zone have to wait for the completion of the previously issued write
before being in turn dispatched to the disk. This ensures that
sequential writes are processed in the correct order without needing
any modification to the execution model of blk-mq. In addition, this
also protects against write reordering at the HBA level (e.g. AHCI).
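
Since block/zoned-iosched.c is truncated in this archive, the following is a
minimal sketch of the per-zone write locking idea described above. It is not
the patch's code: the zoned_data structure, its field names and the zoned_*
helper names are assumptions made purely for illustration.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/bitops.h>

/* Hypothetical per-queue scheduler data; names are assumptions. */
struct zoned_data {
	spinlock_t	zone_lock;	/* atomicity across dispatch contexts */
	unsigned long	*zones_wlock;	/* one bit per sequential zone */
};

/* Zone number of a request; the zone size (chunk_sectors) is a power of two. */
static unsigned int zoned_rq_zone_no(struct request *rq)
{
	return blk_rq_pos(rq) >> ilog2(blk_queue_zone_sectors(rq->q));
}

/*
 * Return true if a write request may be dispatched now, false if its
 * target zone already has a write in flight and it must stay queued.
 */
static bool zoned_try_lock_zone(struct zoned_data *zd, struct request *rq)
{
	bool locked;

	spin_lock(&zd->zone_lock);
	locked = !test_and_set_bit(zoned_rq_zone_no(rq), zd->zones_wlock);
	spin_unlock(&zd->zone_lock);

	return locked;
}

/* On write completion, unlock the zone so the next queued write can go. */
static void zoned_unlock_zone(struct zoned_data *zd, struct request *rq)
{
	clear_bit_unlock(zoned_rq_zone_no(rq), zd->zones_wlock);
}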

Signed-off-by: Damien Le Moal 
---
 Documentation/block/zoned-iosched.txt |  48 ++
 block/Kconfig.iosched |  12 +
 block/Makefile|   1 +
 block/zoned-iosched.c | 925 ++
 4 files changed, 986 insertions(+)
 create mode 100644 Documentation/block/zoned-iosched.txt
 create mode 100644 block/zoned-iosched.c

diff --git a/Documentation/block/zoned-iosched.txt 
b/Documentation/block/zoned-iosched.txt
new file mode 100644
index ..b269125bdc61
--- /dev/null
+++ b/Documentation/block/zoned-iosched.txt
@@ -0,0 +1,48 @@
+Zoned I/O scheduler
+===
+
+The Zoned I/O scheduler solves the write ordering problems of zoned block
+devices that are inherent to the absence of a global request queue lock in the
+blk-mq infrastructure. Multiple contexts may simultaneously try to dispatch
+write requests to the same sequential zone of a zoned block device,
+potentially breaking the sequential write order imposed by the device.
+
+The Zoned I/O scheduler is based on the mq-deadline scheduler. It shares the
+same tunables and behaves in a comparable manner. The main difference
+introduced with the zoned scheduler is the handling of write batches. Whereas
+mq-deadline will keep dispatching write requests to the device as long as the
+batching size allows and reads are not starved, the zoned scheduler introduces
+additional constraints:
+1) At most one write request can be issued to a sequential zone of the device
+at any time. This ensures that no reordering of sequential writes to a
+sequential zone can happen once the write request leaves the scheduler
+internal queue (rb tree).
+2) The number of sequential zones that can be simultaneously written is
+limited to the device advertised maximum number of open zones. This
+additional condition avoids performance degradation due to excessive open
+zone resource use at the device level.
+
+These conditions do not apply to write requests targeting conventional zones.
+For these, the zoned scheduler behaves exactly like the mq-deadline scheduler.
+
+The zoned I/O scheduler cannot be used with regular block devices. It can only
+be used with host-managed or host-aware zoned block devices.
+Using the zoned I/O scheduler is mandatory with host-managed disks unless the
+disk user itself tightly controls write sequencing to sequential zones. The
+zoned scheduler will treat host-aware disks exactly the same way as
+host-managed devices. That is, even though host-aware disks can be randomly
+written, the zoned scheduler will still impose the limit of one write per zone
+so that sequential write sequences are preserved.
+
+For host-managed disks, automating the use of the zoned scheduler can be done
+simply with a udev rule. An example of such a rule is shown below.
+
+# Set zoned scheduler for host-managed zoned block devices
+ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/zoned}=="host-managed", \
+   ATTR{queue/scheduler}="zoned"
+
+Zoned I/O scheduler tunables
+
+
+Tunables of the Zoned I/O scheduler are identical to those of the deadline
+I/O scheduler. See Documentation/block/deadline-iosched.txt for details.
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index fd2cefa47d35..b87c67dbf1f6 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -70,6 +70,18 @@ config MQ_IOSCHED_DEADLINE
---help---
  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_ZONED
+   tristate "Zoned I/O scheduler"
+   depends on BLK_DEV_ZONED
+   depends on SCSI_MOD
+   depends on BLK_DEV_SD
+   default y
+   ---help---
+ MQ deadline IO scheduler with support for zoned block devices.
+
+ This should be set as the default I/O scheduler for host-managed
+ zoned block devices. It is optional for host-aware block devices.
+
 config MQ_IOSCHED_KYBER
tristate "Kyber I/O scheduler"
default y
diff --git a/block/Makefile b/block/Makefile
index 9396ebc85d24..8f35c97a0199 

[PATCH V3 09/12] scsi: sd_zbc: Initialize device queue zoned structure

2017-09-15 Thread Damien Le Moal
Allocate and initialize the disk request queue zoned structure on disk
revalidate. As the bitmap allocation for the seq_zones field of the
zoned structure is identical to the allocation of the zones write lock
bitmap, introduce the helper sd_zbc_alloc_zone_bitmap().
Using this helper, wait for the disk capacity and number of zones to
stabilize on the second revalidation pass to allocate and initialize
the bitmaps.

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/sd_zbc.c | 119 +++---
 1 file changed, 114 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 7e30d0c22930..534e747e8bcf 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -534,8 +534,103 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
return 0;
 }
 
+/*
+ * Allocate a zone bitmap (one bit per zone).
+ */
+static inline unsigned long *sd_zbc_alloc_zone_bitmap(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   return kzalloc_node(BITS_TO_LONGS(sdkp->nr_zones)
+   * sizeof(unsigned long),
+   GFP_KERNEL, q->node);
+}
+
+/*
+ * Initialize the seq_zones bitmap to allow identifying sequential zones.
+ */
+static int sd_zbc_setup_seq_zones(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+   unsigned long *seq_zones;
+   sector_t block = 0;
+   unsigned char *buf;
+   unsigned char *rec;
+   unsigned int buf_len;
+   unsigned int list_length;
+   unsigned int n = 0;
+   u8 type, cond;
+   int ret = -ENOMEM;
+
+   seq_zones = sd_zbc_alloc_zone_bitmap(sdkp);
+   if (!seq_zones)
+   return -ENOMEM;
+
+   buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   while (block < sdkp->capacity) {
+
+   ret = sd_zbc_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, block);
+   if (ret)
+   goto out;
+
+   /*
+* Parse reported zone descriptors to find sequential zones.
+* Since read-only and offline zones cannot be written, do not
+* mark them as sequential in the bitmap.
+*/
+   list_length = get_unaligned_be32(&buf[0]) + 64;
+   rec = buf + 64;
+   buf_len = min(list_length, SD_ZBC_BUF_SIZE);
+   while (rec < buf + buf_len) {
+   type = rec[0] & 0x0f;
+   cond = (rec[1] >> 4) & 0xf;
+   if (type != ZBC_ZONE_TYPE_CONV &&
+   cond != ZBC_ZONE_COND_READONLY &&
+   cond != ZBC_ZONE_COND_OFFLINE)
+   set_bit(n, seq_zones);
+   block = get_unaligned_be64(&rec[8]) +
+   get_unaligned_be64(&rec[16]);
+   rec += 64;
+   n++;
+   }
+
+   }
+
+   if (n != sdkp->nr_zones) {
+   /* Something was wrong */
+   ret = -EIO;
+   }
+
+out:
+   kfree(buf);
+   if (ret) {
+   kfree(seq_zones);
+   return ret;
+   }
+
+   q->zoned.seq_zones = seq_zones;
+
+   return 0;
+}
+
+static void sd_zbc_cleanup(struct scsi_disk *sdkp)
+{
+   struct request_queue *q = sdkp->disk->queue;
+
+   kfree(q->zoned.seq_zones);
+   q->zoned.seq_zones = NULL;
+
+   kfree(sdkp->zones_wlock);
+   sdkp->zones_wlock = NULL;
+}
+
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
+   struct request_queue *q = sdkp->disk->queue;
+   int ret;
 
/* READ16/WRITE16 is mandatory for ZBC disks */
sdkp->device->use_16_for_rw = 1;
@@ -547,14 +642,28 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
sdkp->nr_zones =
round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
+   /*
+* Wait for the disk capacity to stabilize before
+* initializing zone related information.
+*/
+   if (sdkp->first_scan)
+   return 0;
+
if (!sdkp->zones_wlock) {
-   sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-   sizeof(unsigned long),
-   GFP_KERNEL);
+   sdkp->zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
if (!sdkp->zones_wlock)
return -ENOMEM;
}
 
+   q->zoned.nr_zones = sdkp->nr_zones;
+   if (!q->zoned.seq_zones) {
+   ret = sd_zbc_setup_seq_zones(sdkp);
+   if (ret) {
+   sd_zbc_cleanup(sdkp);
+   return ret;
+   }
+   }
+
return 0;
 }
 
@@ -609,14 +718,14 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned 
char *buf)
 
 

[PATCH V3 11/12] scsi: sd_zbc: Disable zone write locking with scsi-mq

2017-09-15 Thread Damien Le Moal
In the case of a ZBC disk used with scsi-mq, zone write locking does
not prevent write reordering in sequential zones. Unlike the legacy
case, zone locking is done after the command request is removed from
the scheduler dispatch queue. That is, at the time of zone locking,
the write command may already be out of order, making locking
ineffective. Write order guarantees can only be provided by an
adapted I/O scheduler.

Disable zone write locking in sd_zbc_write_lock_zone() if the disk is
used with scsi-mq. As the disk zones_wlock bitmap is not necessary,
do not allocate it.

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
---
 drivers/scsi/sd_zbc.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index c32a3a3d9138..de504026f2c0 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -279,7 +279,7 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
sector_t zone_sectors = sd_zbc_zone_sectors(sdkp);
-   unsigned int zno = sd_zbc_zone_no(sdkp, sector);
+   unsigned int zno;
 
/*
 * Note: Checks of the alignment of the write command on
@@ -291,6 +291,10 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
(sector & (zone_sectors - 1)) + blk_rq_sectors(rq) > zone_sectors)
return BLKPREP_KILL;
 
+   /* No write locking with scsi-mq */
+   if (q->mq_ops)
+   return BLKPREP_OK;
+
/*
 * There is no write constraints on conventional zones. So any write
 * command can be sent. But do not issue more than one write command
@@ -298,6 +302,7 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 * due to the unlocking of the request queue in the dispatch path of
 * legacy scsi path, as well as at the HBA level (e.g. AHCI).
 */
+   zno = sd_zbc_zone_no(sdkp, sector);
if (q->zoned.seq_zones && test_bit(zno, q->zoned.seq_zones))
return BLKPREP_OK;
if (sdkp->zones_wlock && test_and_set_bit(zno, sdkp->zones_wlock))
@@ -653,7 +658,7 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
if (sdkp->first_scan)
return 0;
 
-   if (!sdkp->zones_wlock) {
+   if (!q->mq_ops && !sdkp->zones_wlock) {
sdkp->zones_wlock = sd_zbc_alloc_zone_bitmap(sdkp);
if (!sdkp->zones_wlock)
return -ENOMEM;
-- 
2.13.5



[PATCH V3 10/12] scsi: sd_zbc: Limit zone write locking to sequential zones

2017-09-15 Thread Damien Le Moal
Zoned block devices have no write constraints for conventional zones.
So write locking of conventional zones is not necessary and can even
hurt performance by unnecessarily operating the disk under low queue
depth. To avoid this, use the disk request queue seq_zones bitmap to
allow any write to be issued to conventional zones, locking only
sequential zones.

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/sd_zbc.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 534e747e8bcf..c32a3a3d9138 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -275,6 +275,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
+   struct request_queue *q = rq->q;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
sector_t sector = blk_rq_pos(rq);
sector_t zone_sectors = sd_zbc_zone_sectors(sdkp);
@@ -286,18 +287,20 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 */
 
/* Do not allow zone boundaries crossing on host-managed drives */
-   if (blk_queue_zoned_model(sdkp->disk->queue) == BLK_ZONED_HM &&
+   if (blk_queue_zoned_model(q) == BLK_ZONED_HM &&
(sector & (zone_sectors - 1)) + blk_rq_sectors(rq) > zone_sectors)
return BLKPREP_KILL;
 
/*
-* Do not issue more than one write at a time per
-* zone. This solves write ordering problems due to
-* the unlocking of the request queue in the dispatch
-* path in the non scsi-mq case.
+* There is no write constraints on conventional zones. So any write
+* command can be sent. But do not issue more than one write command
+* at a time per sequential zone. This avoids write ordering problems
+* due to the unlocking of the request queue in the dispatch path of
+* legacy scsi path, as well as at the HBA level (e.g. AHCI).
 */
-   if (sdkp->zones_wlock &&
-   test_and_set_bit(zno, sdkp->zones_wlock))
+   if (q->zoned.seq_zones && test_bit(zno, q->zoned.seq_zones))
+   return BLKPREP_OK;
+   if (sdkp->zones_wlock && test_and_set_bit(zno, sdkp->zones_wlock))
return BLKPREP_DEFER;
 
WARN_ON_ONCE(cmd->flags & SCMD_ZONE_WRITE_LOCK);
@@ -316,8 +319,9 @@ void sd_zbc_write_unlock_zone(struct scsi_cmnd *cmd)
struct request *rq = cmd->request;
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
 
-   if (sdkp->zones_wlock && cmd->flags & SCMD_ZONE_WRITE_LOCK) {
+   if (cmd->flags & SCMD_ZONE_WRITE_LOCK) {
unsigned int zno = sd_zbc_zone_no(sdkp, blk_rq_pos(rq));
+
WARN_ON_ONCE(!test_bit(zno, sdkp->zones_wlock));
cmd->flags &= ~SCMD_ZONE_WRITE_LOCK;
clear_bit_unlock(zno, sdkp->zones_wlock);
-- 
2.13.5



[PATCH V3 07/12] scsi: sd_zbc: Use well defined macros

2017-09-15 Thread Damien Le Moal
Instead of open coding, use the min() macro to calculate the report zones
reply buffer length in sd_zbc_check_zone_size() and the round_up()
macro for calculating the number of zones in sd_zbc_setup().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Johannes Thumshirn 
---
 drivers/scsi/sd_zbc.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 081bf5a3b10b..37b33adbb2ef 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -430,7 +430,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf)
return 0;
 }
 
-#define SD_ZBC_BUF_SIZE 131072
+#define SD_ZBC_BUF_SIZE 131072U
 
 static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 {
@@ -474,10 +474,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
/* Parse REPORT ZONES header */
	list_length = get_unaligned_be32(&buf[0]) + 64;
rec = buf + 64;
-   if (list_length < SD_ZBC_BUF_SIZE)
-   buf_len = list_length;
-   else
-   buf_len = SD_ZBC_BUF_SIZE;
+   buf_len = min(list_length, SD_ZBC_BUF_SIZE);
 
/* Parse zone descriptors */
while (rec < buf + buf_len) {
@@ -547,9 +544,8 @@ static int sd_zbc_setup(struct scsi_disk *sdkp)
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
-   if (sdkp->capacity & (sdkp->zone_blocks - 1))
-   sdkp->nr_zones++;
+   sdkp->nr_zones =
+   round_up(sdkp->capacity, sdkp->zone_blocks) >> sdkp->zone_shift;
 
if (!sdkp->zones_wlock) {
sdkp->zones_wlock = kcalloc(BITS_TO_LONGS(sdkp->nr_zones),
-- 
2.13.5



[PATCH V3 08/12] scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()

2017-09-15 Thread Damien Le Moal
The three values starting at byte 8 of the Zoned Block Device
Characteristics VPD page B6h are 32-bit values, not 64-bit. So use
get_unaligned_be32() to retrieve the values and not get_unaligned_be64().

Fixes: 89d947561077 ("sd: Implement support for ZBC devices")
Cc: 

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
---
 drivers/scsi/sd_zbc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 37b33adbb2ef..7e30d0c22930 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -385,15 +385,15 @@ static int sd_zbc_read_zoned_characteristics(struct 
scsi_disk *sdkp,
if (sdkp->device->type != TYPE_ZBC) {
/* Host-aware */
sdkp->urswrz = 1;
-   sdkp->zones_optimal_open = get_unaligned_be64(&buf[8]);
-   sdkp->zones_optimal_nonseq = get_unaligned_be64(&buf[12]);
+   sdkp->zones_optimal_open = get_unaligned_be32(&buf[8]);
+   sdkp->zones_optimal_nonseq = get_unaligned_be32(&buf[12]);
sdkp->zones_max_open = 0;
} else {
/* Host-managed */
sdkp->urswrz = buf[4] & 1;
sdkp->zones_optimal_open = 0;
sdkp->zones_optimal_nonseq = 0;
-   sdkp->zones_max_open = get_unaligned_be64(&buf[16]);
+   sdkp->zones_max_open = get_unaligned_be32(&buf[16]);
}
 
return 0;
-- 
2.13.5



[PATCH V3 06/12] scsi: sd_zbc: Rearrange code

2017-09-15 Thread Damien Le Moal
Rearrange sd_zbc_setup() to include use_16_for_rw and use_10_for_rw
assignments and move the calculation of sdkp->zone_shift together
with the assignment of the verified zone_blocks value in
sd_zbc_check_zone_size().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/sd_zbc.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index fd29717b6eab..081bf5a3b10b 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -532,6 +532,7 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
}
 
sdkp->zone_blocks = zone_blocks;
+   sdkp->zone_shift = ilog2(zone_blocks);
 
return 0;
 }
@@ -539,10 +540,13 @@ static int sd_zbc_check_zone_size(struct scsi_disk *sdkp)
 static int sd_zbc_setup(struct scsi_disk *sdkp)
 {
 
+   /* READ16/WRITE16 is mandatory for ZBC disks */
+   sdkp->device->use_16_for_rw = 1;
+   sdkp->device->use_10_for_rw = 0;
+
/* chunk_sectors indicates the zone size */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, sdkp->zone_blocks));
-   sdkp->zone_shift = ilog2(sdkp->zone_blocks);
sdkp->nr_zones = sdkp->capacity >> sdkp->zone_shift;
if (sdkp->capacity & (sdkp->zone_blocks - 1))
sdkp->nr_zones++;
@@ -605,10 +609,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned 
char *buf)
if (ret)
goto err;
 
-   /* READ16/WRITE16 is mandatory for ZBC disks */
-   sdkp->device->use_16_for_rw = 1;
-   sdkp->device->use_10_for_rw = 0;
-
return 0;
 
 err:
-- 
2.13.5



[PATCH V3 01/12] block: Fix declaration of blk-mq debugfs functions

2017-09-15 Thread Damien Le Moal
__blk_mq_debugfs_rq_show() and blk_mq_debugfs_rq_show() are exported
symbols but are only declared in the block internal file
block/blk-mq-debugfs.h, which is not cleanly accessible to files outside
of the block directory.
Move the declarations of these functions to the new file
include/linux/blk-mq-debugfs.h to make them cleanly available to other
modules.

While at it, also move the definition of the blk_mq_debugfs_attr
structure to allow scheduler modules outside of the block directory to
define debugfs attributes.
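
As an illustration only (not part of the patch), an out-of-tree scheduler
module could now define a debugfs attribute as sketched below. Only
struct blk_mq_debugfs_attr and blk_mq_debugfs_rq_show() come from the moved
header; the example_* names and the assumption that the seq_file private data
points at the scheduler's per-queue data are hypothetical.

#include <linux/blk-mq-debugfs.h>
#include <linux/seq_file.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* Hypothetical per-queue scheduler data holding a list of requests. */
struct example_data {
	spinlock_t		lock;
	struct list_head	dispatch;	/* requests linked via queuelist */
};

static void *example_dispatch_start(struct seq_file *m, loff_t *pos)
{
	struct example_data *ed = m->private;	/* assumed set at registration */

	spin_lock(&ed->lock);
	return seq_list_start(&ed->dispatch, *pos);
}

static void *example_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct example_data *ed = m->private;

	return seq_list_next(v, &ed->dispatch, pos);
}

static void example_dispatch_stop(struct seq_file *m, void *v)
{
	struct example_data *ed = m->private;

	spin_unlock(&ed->lock);
}

static const struct seq_operations example_dispatch_seq_ops = {
	.start	= example_dispatch_start,
	.next	= example_dispatch_next,
	.stop	= example_dispatch_stop,
	/* blk_mq_debugfs_rq_show() is one of the relocated declarations */
	.show	= blk_mq_debugfs_rq_show,
};

static const struct blk_mq_debugfs_attr example_debugfs_attrs[] = {
	{"dispatch", 0400, .seq_ops = &example_dispatch_seq_ops},
	{},
};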

Signed-off-by: Damien Le Moal 
Reviewed-by: Johannes Thumshirn 
---
 block/blk-mq-debugfs.h | 14 +-
 include/linux/blk-mq-debugfs.h | 23 +++
 2 files changed, 24 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/blk-mq-debugfs.h

diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a182e6f97565..4ffeae84b83a 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -3,19 +3,7 @@
 
 #ifdef CONFIG_BLK_DEBUG_FS
 
-#include 
-
-struct blk_mq_debugfs_attr {
-   const char *name;
-   umode_t mode;
-   int (*show)(void *, struct seq_file *);
-   ssize_t (*write)(void *, const char __user *, size_t, loff_t *);
-   /* Set either .show or .seq_ops. */
-   const struct seq_operations *seq_ops;
-};
-
-int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
-int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
+#include 
 
 int blk_mq_debugfs_register(struct request_queue *q);
 void blk_mq_debugfs_unregister(struct request_queue *q);
diff --git a/include/linux/blk-mq-debugfs.h b/include/linux/blk-mq-debugfs.h
new file mode 100644
index ..211537f61dce
--- /dev/null
+++ b/include/linux/blk-mq-debugfs.h
@@ -0,0 +1,23 @@
+#ifndef BLK_MQ_DEBUGFS_H
+#define BLK_MQ_DEBUGFS_H
+
+#ifdef CONFIG_BLK_DEBUG_FS
+
+#include 
+#include 
+
+struct blk_mq_debugfs_attr {
+   const char *name;
+   umode_t mode;
+   int (*show)(void *, struct seq_file *);
+   ssize_t (*write)(void *, const char __user *, size_t, loff_t *);
+   /* Set either .show or .seq_ops. */
+   const struct seq_operations *seq_ops;
+};
+
+int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
+int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
+
+#endif
+
+#endif
-- 
2.13.5



[PATCH V3 05/12] scsi: sd_zbc: Fix comments and indentation

2017-09-15 Thread Damien Le Moal
Fix comment style (do not use the kernel-doc documented comment style) and
add some comments to clarify some functions. Also fix the indentation of some
function signatures and remove a useless blank line in sd_zbc_read_zones().

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
---
 drivers/scsi/scsi_lib.c |  5 +++-
 drivers/scsi/sd_zbc.c   | 67 -
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9cf6a80fe297..c72b97a74906 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1752,7 +1752,10 @@ static void scsi_done(struct scsi_cmnd *cmd)
  *
  * Returns: Nothing
  *
- * Lock status: IO request lock assumed to be held when called.
+ * Lock status: request queue lock assumed to be held when called.
+ *
+ * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
+ * protection for ZBC disks.
  */
 static void scsi_request_fn(struct request_queue *q)
__releases(q->queue_lock)
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 692c8cbc7ed8..fd29717b6eab 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -31,11 +31,11 @@
 
 #include "sd.h"
 
-/**
- * Convert a zone descriptor to a zone struct.
+/*
+ * Convert a zone descriptor to a struct blk_zone,
+ * taking care of converting LBA sized values to 512B sectors.
  */
-static void sd_zbc_parse_report(struct scsi_disk *sdkp,
-   u8 *buf,
+static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf,
struct blk_zone *zone)
 {
struct scsi_device *sdp = sdkp->device;
@@ -57,8 +57,9 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp,
zone->wp = zone->start + zone->len;
 }
 
-/**
+/*
  * Issue a REPORT ZONES scsi command.
+ * For internal use during device validation.
  */
 static int sd_zbc_report_zones(struct scsi_disk *sdkp, unsigned char *buf,
   unsigned int buflen, sector_t lba)
@@ -99,6 +100,9 @@ static int sd_zbc_report_zones(struct scsi_disk *sdkp, 
unsigned char *buf,
return 0;
 }
 
+/*
+ * Prepare a REPORT ZONES scsi command for a REQ_OP_ZONE_REPORT request.
+ */
 int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -141,6 +145,11 @@ int sd_zbc_setup_report_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/*
+ * Process a REPORT ZONES scsi command reply, converting all reported
+ * zone descriptors to struct blk_zone. The conversion is done in-place,
+ * directly in the request specified sg buffer.
+ */
 static void sd_zbc_report_zones_complete(struct scsi_cmnd *scmd,
 unsigned int good_bytes)
 {
@@ -196,17 +205,26 @@ static void sd_zbc_report_zones_complete(struct scsi_cmnd 
*scmd,
local_irq_restore(flags);
 }
 
+/*
+ * Zone size in number of 512B sectors.
+ */
 static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
 {
return logical_to_sectors(sdkp->device, sdkp->zone_blocks);
 }
 
+/*
+ * Zone number of the specified sector.
+ */
 static inline unsigned int sd_zbc_zone_no(struct scsi_disk *sdkp,
  sector_t sector)
 {
return sectors_to_logical(sdkp->device, sector) >> sdkp->zone_shift;
 }
 
+/*
+ * Prepare a RESET WRITE POINTER scsi command for a REQ_OP_ZONE_RESET request.
+ */
 int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -239,6 +257,21 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
return BLKPREP_OK;
 }
 
+/*
+ * Write lock a sequential zone. Called from sd_init_cmd() for write requests
+ * (standard write, write same or write zeroes operations).
+ * If the request target zone is not already locked, the zone is locked
+ * and BLKPREP_OK returned, allowing the request to proceed through dispatch in
+ * scsi_request_fn(). Otherwise, BLKPREP_DEFER is returned, forcing the request
+ * to wait for the zone to be unlocked, that is, for the previously issued 
write
+ * request targeting the same zone to complete.
+ *
+ * This is called from blk_peek_request() context with the
+ * queue lock held and before the request is removed from the scheduler.
+ * As a result, multiple contexts executing concurrently scsi_request_fn()
+ * cannot result in write sequence reordering as only a single write request
+ * per zone is allowed to proceed.
+ */
 int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 {
struct request *rq = cmd->request;
@@ -261,10 +294,7 @@ int sd_zbc_write_lock_zone(struct scsi_cmnd *cmd)
 * Do not issue more than one write at a time per
 * zone. This solves write ordering problems due to
 * the unlocking of the request queue in the dispatch
-* path in the non scsi-mq case. For scsi-mq, this
-* also avoids potential write 

[PATCH V3 03/12] block: Add zoned block device information to request queue

2017-09-15 Thread Damien Le Moal
Components relying only on the request_queue structure for managing
and controlling block devices (e.g. I/O schedulers) have a limited
view/knowledge of the device being controlled. For instance, the device
capacity cannot be known easily, which for a zoned block device also
results in the inability to know the number of zones of the device.

Define the structure blk_zoned and include it as the zoned field of a
request queue to allow low level drivers to provide more information
on the device.
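
As a quick illustration (not from the patch), a block layer consumer such as
an I/O scheduler could use the new information roughly as follows. Only
blk_queue_nr_zones(), blk_queue_zone_sectors() and the q->zoned fields come
from this series; the helper itself is a hypothetical sketch and assumes
CONFIG_BLK_DEV_ZONED is set.

#include <linux/blkdev.h>
#include <linux/bitops.h>

/*
 * Hypothetical helper: does a write to 'sector' target a sequential
 * zone and thus need per-zone serialization? Returns false for
 * conventional zones and for queues without zone information.
 */
static bool example_sector_in_seq_zone(struct request_queue *q, sector_t sector)
{
	unsigned int zno;

	if (!blk_queue_nr_zones(q) || !q->zoned.seq_zones)
		return false;

	/* chunk_sectors is the zone size (a power of two) for zoned devices */
	zno = sector >> ilog2(blk_queue_zone_sectors(q));
	if (zno >= q->zoned.nr_zones)
		return false;

	return test_bit(zno, q->zoned.seq_zones);
}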

Signed-off-by: Damien Le Moal 
---
 include/linux/blkdev.h | 24 
 1 file changed, 24 insertions(+)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 460294bb0fa5..297e0a2fee31 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -349,6 +349,11 @@ struct queue_limits {
 
 #ifdef CONFIG_BLK_DEV_ZONED
 
+struct blk_zoned {
+   unsigned intnr_zones;
+   unsigned long   *seq_zones;
+};
+
 struct blk_zone_report_hdr {
unsigned intnr_zones;
u8  padding[60];
@@ -492,6 +497,10 @@ struct request_queue {
struct blk_integrity integrity;
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+#ifdef CONFIG_BLK_DEV_ZONED
+   struct blk_zonedzoned;
+#endif
+
 #ifdef CONFIG_PM
struct device   *dev;
int rpm_status;
@@ -785,6 +794,11 @@ static inline unsigned int blk_queue_zone_sectors(struct 
request_queue *q)
return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
 
+static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+{
+   return blk_queue_is_zoned(q) ? q->zoned.nr_zones : 0;
+}
+
 static inline bool rq_is_sync(struct request *rq)
 {
return op_is_sync(rq->cmd_flags);
@@ -1582,6 +1596,16 @@ static inline unsigned int bdev_zone_sectors(struct 
block_device *bdev)
return 0;
 }
 
+static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+{
+   struct request_queue *q = bdev_get_queue(bdev);
+
+   if (q)
+   return blk_queue_nr_zones(q);
+
+   return 0;
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
return q ? q->dma_alignment : 511;
-- 
2.13.5



[PATCH V3 04/12] scsi: sd_zbc: Move ZBC declarations to scsi_proto.h

2017-09-15 Thread Damien Le Moal
Move standard macro definitions for the zone types and zone conditions
to scsi_proto.h together with the definitions related to the
REPORT ZONES command. While at it, define all values in the enums to
be clear.

Also remove unnecessary includes in sd_zbc.c.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal 
Reviewed-by: Bart Van Assche 
Reviewed-by: Johannes Thumshirn 
---
 drivers/scsi/sd_zbc.c | 24 
 include/scsi/scsi_proto.h | 45 ++---
 2 files changed, 34 insertions(+), 35 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 8aa54779aac1..692c8cbc7ed8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -28,32 +28,8 @@
 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
 
 #include "sd.h"
-#include "scsi_priv.h"
-
-enum zbc_zone_type {
-   ZBC_ZONE_TYPE_CONV = 0x1,
-   ZBC_ZONE_TYPE_SEQWRITE_REQ,
-   ZBC_ZONE_TYPE_SEQWRITE_PREF,
-   ZBC_ZONE_TYPE_RESERVED,
-};
-
-enum zbc_zone_cond {
-   ZBC_ZONE_COND_NO_WP,
-   ZBC_ZONE_COND_EMPTY,
-   ZBC_ZONE_COND_IMP_OPEN,
-   ZBC_ZONE_COND_EXP_OPEN,
-   ZBC_ZONE_COND_CLOSED,
-   ZBC_ZONE_COND_READONLY = 0xd,
-   ZBC_ZONE_COND_FULL,
-   ZBC_ZONE_COND_OFFLINE,
-};
 
 /**
  * Convert a zone descriptor to a zone struct.
diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h
index 8c285d9a06d8..39130a9c05bf 100644
--- a/include/scsi/scsi_proto.h
+++ b/include/scsi/scsi_proto.h
@@ -301,19 +301,42 @@ struct scsi_lun {
 
 /* Reporting options for REPORT ZONES */
 enum zbc_zone_reporting_options {
-   ZBC_ZONE_REPORTING_OPTION_ALL = 0,
-   ZBC_ZONE_REPORTING_OPTION_EMPTY,
-   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN,
-   ZBC_ZONE_REPORTING_OPTION_CLOSED,
-   ZBC_ZONE_REPORTING_OPTION_FULL,
-   ZBC_ZONE_REPORTING_OPTION_READONLY,
-   ZBC_ZONE_REPORTING_OPTION_OFFLINE,
-   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
-   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE,
-   ZBC_ZONE_REPORTING_OPTION_NON_WP = 0x3f,
+   ZBC_ZONE_REPORTING_OPTION_ALL   = 0x00,
+   ZBC_ZONE_REPORTING_OPTION_EMPTY = 0x01,
+   ZBC_ZONE_REPORTING_OPTION_IMPLICIT_OPEN = 0x02,
+   ZBC_ZONE_REPORTING_OPTION_EXPLICIT_OPEN = 0x03,
+   ZBC_ZONE_REPORTING_OPTION_CLOSED= 0x04,
+   ZBC_ZONE_REPORTING_OPTION_FULL  = 0x05,
+   ZBC_ZONE_REPORTING_OPTION_READONLY  = 0x06,
+   ZBC_ZONE_REPORTING_OPTION_OFFLINE   = 0x07,
+   /* 0x08 to 0x0f are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NEED_RESET_WP = 0x10,
+   ZBC_ZONE_REPORTING_OPTION_NON_SEQWRITE  = 0x11,
+   /* 0x12 to 0x3e are reserved */
+   ZBC_ZONE_REPORTING_OPTION_NON_WP= 0x3f,
 };
 
 #define ZBC_REPORT_ZONE_PARTIAL 0x80
 
+/* Zone types of REPORT ZONES zone descriptors */
+enum zbc_zone_type {
+   ZBC_ZONE_TYPE_CONV  = 0x1,
+   ZBC_ZONE_TYPE_SEQWRITE_REQ  = 0x2,
+   ZBC_ZONE_TYPE_SEQWRITE_PREF = 0x3,
+   /* 0x4 to 0xf are reserved */
+};
+
+/* Zone conditions of REPORT ZONES zone descriptors */
+enum zbc_zone_cond {
+   ZBC_ZONE_COND_NO_WP = 0x0,
+   ZBC_ZONE_COND_EMPTY = 0x1,
+   ZBC_ZONE_COND_IMP_OPEN  = 0x2,
+   ZBC_ZONE_COND_EXP_OPEN  = 0x3,
+   ZBC_ZONE_COND_CLOSED= 0x4,
+   /* 0x5 to 0xc are reserved */
+   ZBC_ZONE_COND_READONLY  = 0xd,
+   ZBC_ZONE_COND_FULL  = 0xe,
+   ZBC_ZONE_COND_OFFLINE   = 0xf,
+};
+
 #endif /* _SCSI_PROTO_H_ */
-- 
2.13.5



[PATCH V3 00/12] scsi-mq support for ZBC disks

2017-09-15 Thread Damien Le Moal
This series implements support for ZBC disks used through the scsi-mq I/O path.

The current scsi level support of ZBC disks guarantees write request ordering
using a per-zone write lock which prevents simultaneously issuing multiple
write commands to a zone, thereby avoiding reordering of sequential writes to
sequential zones. This method is however ineffective when scsi-mq is used with
zoned block devices. This is due to the different execution model of blk-mq
which passes a request to the scsi layer for dispatching after the request has
been removed from the I/O scheduler queue. That is, when the scsi layer tries
to lock the target zone of the request, the request may already be out of
order and zone write locking fails to prevent that.

Various approaches have been tried to solve this problem. All of them had the
serious disadvantage of cluttering blk-mq code with zoned block device specific
conditions and processing. As such extensive changes can only turn into a
maintenance nightmare, a radically different solution is proposed here.

This series proposes implementing scsi-mq support for zoned block devices in
the form of a new "zoned" I/O scheduler. The zoned I/O scheduler is
based on mq-deadline, with most of the code reused with no modification other
than name changes. Zoned only differs from mq-deadline by the addition of a
per-zone write locking mechanism similar to that implemented in sd_zbc.c. The
zoned scheduler zone write locking mechanism is used for the exact same purpose
as the one in the scsi disk driver, that is, to limit writes per zone to at
most one request to avoid reordering. The locking context however changes from
that of scsi-mq and is moved to the dispatch_request method of the scheduler.
Within this context, under a spin lock guaranteeing atomicity against other
dispatch contexts, target zones of write requests can be locked before the
write requests are removed from the scheduler. In effect, this results in the same
behavior as the legacy scsi path. Sequential write ordering is preserved.

The zoned scheduler is also optimized to not lock conventional zones. To do so,
the new data structure blk_zoned is added to the request queue structure so
that the low level scsi code can pass information such as number of zones and
zone types to the block layer scheduler. This avoids difficulties in accessing
this information from the I/O scheduler initialization (init_queue() method)
context.

The series patches are as follows:
- The first 2 patches fix exported symbols declaration to allow compiling
  external I/O schedulers as modules. No functional changes are introduced.
- Patch 3 introduces the new blk_zoned data structure added to request queues
- Patches 4 to 7 reorganize and clean up the scsi ZBC support
- Patch 8 is a bug fix
- Patch 9 implements the initialization of the blk_zoned structure for zoned
  block devices
- Patch 10 is an optimization for the legacy scsi path which avoids zone
  locking if a write request targets a conventional zone.
- Patch 11 disables zone write locking for disks accessed through scsi-mq.
- Patch 12 introduces the new zoned blk-mq I/O scheduler.

Comments are as always very much appreciated.

Changes from v2:
* Introduced blk_zoned structure
* Moved I/O scheduler from drivers/scsi to block 

Changes from v1:
* Addressed Bart's comments for the blk-mq patches (declarations files)
* Split (former) patch 4 into multiple patches to facilitate review
* Fixed scsi disk lookup from io scheduler by introducing
  scsi_disk_from_queue()

Damien Le Moal (12):
  block: Fix declaration of blk-mq debugfs functions
  block: Fix declaration of blk-mq scheduler functions
  block: Add zoned block device information to request queue
  scsi: sd_zbc: Move ZBC declarations to scsi_proto.h
  scsi: sd_zbc: Fix comments and indentation
  scsi: sd_zbc: Rearrange code
  scsi: sd_zbc: Use well defined macros
  scsi: sd_zbc: Fix sd_zbc_read_zoned_characteristics()
  scsi: sd_zbc: Initialize device queue zoned structure
  scsi: sd_zbc: Limit zone write locking to sequential zones
  scsi: sd_zbc: Disable zone write locking with scsi-mq
  block: Introduce zoned I/O scheduler

 Documentation/block/zoned-iosched.txt |  48 ++
 block/Kconfig.iosched |  12 +
 block/Makefile|   1 +
 block/blk-mq-debugfs.h|  14 +-
 block/blk-mq-sched.h  |  11 +-
 block/zoned-iosched.c | 925 ++
 drivers/scsi/scsi_lib.c   |   5 +-
 drivers/scsi/sd_zbc.c | 267 +++---
 include/linux/blk-mq-debugfs.h|  23 +
 include/linux/blk-mq-sched.h  |  14 +
 include/linux/blkdev.h|  24 +
 include/scsi/scsi_proto.h |  45 +-
 12 files changed, 1283 insertions(+), 106 deletions(-)
 create mode 100644 Documentation/block/zoned-iosched.txt
 create mode 100644 block/zoned-iosched.c
 create mode 100644 include/linux/blk-mq-debugfs.h

[PATCH V3 02/12] block: Fix declaration of blk-mq scheduler functions

2017-09-15 Thread Damien Le Moal
The functions blk_mq_sched_free_hctx_data(), blk_mq_sched_try_merge(),
blk_mq_sched_try_insert_merge() and blk_mq_sched_request_inserted() are
all exported symbols but are declared only internally in
block/blk-mq-sched.h. Move these declarations to the new file
include/linux/blk-mq-sched.h to make them available to block scheduler
modules implemented outside of the block directory.

Signed-off-by: Damien Le Moal 
Reviewed-by: Johannes Thumshirn 
---
 block/blk-mq-sched.h | 11 +++
 include/linux/blk-mq-sched.h | 14 ++
 2 files changed, 17 insertions(+), 8 deletions(-)
 create mode 100644 include/linux/blk-mq-sched.h

diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 9267d0b7c197..f48f3c8edcb3 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -1,19 +1,14 @@
-#ifndef BLK_MQ_SCHED_H
-#define BLK_MQ_SCHED_H
+#ifndef INT_BLK_MQ_SCHED_H
+#define INT_BLK_MQ_SCHED_H
 
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
-void (*exit)(struct blk_mq_hw_ctx *));
+#include 
 
 void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio);
 
-void blk_mq_sched_request_inserted(struct request *rq);
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-   struct request **merged_request);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request 
*rq);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
diff --git a/include/linux/blk-mq-sched.h b/include/linux/blk-mq-sched.h
new file mode 100644
index ..8ae1992e02c6
--- /dev/null
+++ b/include/linux/blk-mq-sched.h
@@ -0,0 +1,14 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+/*
+ * Scheduler helper functions.
+ */
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+void (*exit)(struct blk_mq_hw_ctx *));
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
+   struct request **merged_request);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request 
*rq);
+
+#endif
-- 
2.13.5



[PATCH] block: consider merge of segments when merge bio into rq

2017-09-15 Thread Jianchao Wang
When accounting nr_phys_segments while merging bios into a request,
only segment merging within each individual bio is considered, not
merging across the bios already in the request. This leads to a bigger
nr_phys_segments for the request than the real one when the segments
of the bios in the request are contiguous and mergeable. For example,
many small contiguous bios merged back to back are each accounted as
separate segments even though blk_rq_map_sg() will later coalesce
them. The nr_phys_segments of the request can then exceed max_segments
of the queue while the sectors of the request are still far below
max_sectors of the queue.

Consider segment merging when accounting nr_phys_segments of the
request while merging a bio into it. This promotes the merging of
small and contiguous I/O. In addition, it eliminates wasted
scatterlist entries.

Signed-off-by: Jianchao Wang 
---
 block/blk-merge.c | 98 ---
 1 file changed, 64 insertions(+), 34 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 14b6e37..b2f54fd 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -472,54 +472,60 @@ int blk_rq_map_sg(struct request_queue *q, struct request 
*rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
-static inline int ll_new_hw_segment(struct request_queue *q,
-   struct request *req,
-   struct bio *bio)
-{
-   int nr_phys_segs = bio_phys_segments(q, bio);
-
-   if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
-   goto no_merge;
-
-   if (blk_integrity_merge_bio(q, req, bio) == false)
-   goto no_merge;
-
-   /*
-* This will form the start of a new hw segment.  Bump both
-* counters.
-*/
-   req->nr_phys_segments += nr_phys_segs;
-   return 1;
-
-no_merge:
-   req_set_nomerge(q, req);
-   return 0;
-}
-
 int ll_back_merge_fn(struct request_queue *q, struct request *req,
 struct bio *bio)
 {
+   unsigned int seg_size;
+   int total_nr_phys_segs;
+   bool contig;
+
if (req_gap_back_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
integrity_req_gap_back_merge(req, bio))
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
-   blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
-   req_set_nomerge(q, req);
-   return 0;
-   }
+   blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+   goto no_merge;
+
if (!bio_flagged(req->biotail, BIO_SEG_VALID))
blk_recount_segments(q, req->biotail);
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
 
-   return ll_new_hw_segment(q, req, bio);
+   if (blk_integrity_merge_bio(q, req, bio) == false)
+   goto no_merge;
+
+   seg_size = req->biotail->bi_seg_back_size + bio->bi_seg_front_size;
+   total_nr_phys_segs = req->nr_phys_segments + bio_phys_segments(q, bio);
+
+   contig = blk_phys_contig_segment(q, req->biotail, bio);
+   if (contig)
+   total_nr_phys_segs--;
+
+   if (unlikely(total_nr_phys_segs > queue_max_segments(q)))
+   goto no_merge;
+
+   if (contig) {
+   if (req->nr_phys_segments == 1)
+   req->bio->bi_seg_front_size = seg_size;
+   if (bio->bi_phys_segments == 1)
+   bio->bi_seg_back_size = seg_size;
+   }
+   req->nr_phys_segments = total_nr_phys_segs;
+   return 1;
+
+no_merge:
+   req_set_nomerge(q, req);
+   return 0;
 }
 
 int ll_front_merge_fn(struct request_queue *q, struct request *req,
  struct bio *bio)
 {
+   unsigned int seg_size;
+   int total_nr_phys_segs;
+   bool contig;
 
if (req_gap_front_merge(req, bio))
return 0;
@@ -527,16 +533,40 @@ int ll_front_merge_fn(struct request_queue *q, struct 
request *req,
integrity_req_gap_front_merge(req, bio))
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
-   blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
-   req_set_nomerge(q, req);
-   return 0;
-   }
+   blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector))
+   goto no_merge;
+
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
if (!bio_flagged(req->bio, BIO_SEG_VALID))
blk_recount_segments(q, req->bio);
 
-   return ll_new_hw_segment(q, req, bio);
+   if (blk_integrity_merge_bio(q, req, bio) == false)
+   goto no_merge;
+
+   seg_size = req->bio->bi_seg_front_size + bio->bi_seg_back_size;
+   total_nr_phys_segs = req->nr_phys_segments + bio_phys_segments(q, bio);
+
+   contig = blk_phys_contig_segment(q, bio, req->bio);
+   if (contig)
+   total_nr_phys_segs--;
+
+   if (unlikely(total_nr_phys_segs > queue_max_segments(q)))
+   goto no_merge;
+
+   if (contig) {
+   if 

[PATCH] block: move sanity checking ahead of bi_front/back_seg_size updating

2017-09-15 Thread Jianchao Wang
If blk_integrity_merge_rq() returns false or nr_phys_segments exceeds
max_segments, the merge fails, but bi_front/back_seg_size may already
have been modified. To avoid this, move the sanity checks ahead of these
updates.

Signed-off-by: Jianchao Wang 
---
 block/blk-merge.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99038830..14b6e37 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -553,6 +553,7 @@ static bool req_no_special_merge(struct request *req)
 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
 {
+   bool contig;
int total_phys_segments;
unsigned int seg_size =
req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
@@ -575,13 +576,9 @@ static int ll_merge_requests_fn(struct request_queue *q, 
struct request *req,
return 0;
 
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
-   if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
-   if (req->nr_phys_segments == 1)
-   req->bio->bi_seg_front_size = seg_size;
-   if (next->nr_phys_segments == 1)
-   next->biotail->bi_seg_back_size = seg_size;
+   contig = blk_phys_contig_segment(q, req->biotail, next->bio);
+   if (contig)
total_phys_segments--;
-   }
 
if (total_phys_segments > queue_max_segments(q))
return 0;
@@ -589,6 +586,13 @@ static int ll_merge_requests_fn(struct request_queue *q, 
struct request *req,
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
 
+   if (contig) {
+   if (req->nr_phys_segments == 1)
+   req->bio->bi_seg_front_size = seg_size;
+   if (next->nr_phys_segments == 1)
+   next->biotail->bi_seg_back_size = seg_size;
+   }
+
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
-- 
2.7.4