[PATCH 4/5] aio: support for IO polling

2018-11-17 Thread Jens Axboe
Add polled variants of PREAD/PREADV and PWRITE/PWRITEV. These act
like their non-polled counterparts, except we expect to poll for
completion of them. The polling happens at io_getevents() time, and
works just like non-polled IO.

Polled IO doesn't support the user mapped completion ring. Events
must be reaped through the io_getevents() system call. For non-irq
driven poll devices, there's no way to support completion reaping
from userspace by just looking at the ring. The application itself
is the one that pulls completion entries.

It's illegal to mix polled and non-polled IO on the same ioctx.
This is because we need to know at io_getevents() time if we have
any non-polled IOs pending. For polled IO, we can never sleep in
the read_events() path, since polled IO may not trigger async
completions through an IRQ. In lieu of having some way of marking
an ioctx as strictly polled or non-polled, we track this state
and return -EINVAL if the application mixes the two types.
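
For illustration only, a minimal sketch of the kind of check this
implies (the helper name and exact synchronization are assumptions made
for this sketch, not code from the patch):

    /*
     * Illustrative sketch, not the patch's actual helper: submission
     * fails the iocb with -EINVAL if this returns false.
     */
    static bool aio_ctx_type_ok(struct kioctx *ctx, bool polled)
    {
        int this_type = polled ? CTX_TYPE_POLLED : CTX_TYPE_NORMAL;
        int other_type = polled ? CTX_TYPE_NORMAL : CTX_TYPE_POLLED;

        set_bit(this_type, &ctx->io_type);
        return !test_bit(other_type, &ctx->io_type);
    }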

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 490 ---
 include/uapi/linux/aio_abi.h |   2 +
 2 files changed, 463 insertions(+), 29 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 3d9bc81cf500..500da3ffc376 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -86,6 +86,11 @@ struct ctx_rq_wait {
atomic_t count;
 };
 
+enum {
+   CTX_TYPE_NORMAL = 0,
+   CTX_TYPE_POLLED,
+};
+
 struct kioctx {
struct percpu_ref   users;
atomic_t dead;
@@ -96,6 +101,12 @@ struct kioctx {
 
struct __percpu kioctx_cpu *cpu;
 
+   /*
+* CTX_TYPE_ bits set. Used to not mix and match polled and
+* normal IO.
+*/
+   unsigned long   io_type;
+
/*
 * For percpu reqs_available, number of slots we move to/from global
 * counter at a time:
@@ -138,6 +149,12 @@ struct kioctx {
atomic_t reqs_available;
} cacheline_aligned_in_smp;
 
+   struct {
+   spinlock_t poll_lock;
+   struct list_head poll_pending;
+   struct list_head poll_done;
+   } cacheline_aligned_in_smp;
+
struct {
spinlock_t  ctx_lock;
struct list_head active_reqs;   /* used for cancellation */
@@ -191,6 +208,9 @@ struct aio_kiocb {
 
struct list_head ki_list;   /* the aio core uses this
 * for cancellation */
+
+   struct list_head ki_poll_list;
+
refcount_t  ki_refcnt;
 
/*
@@ -198,6 +218,12 @@ struct aio_kiocb {
 * this is the underlying eventfd context to deliver events to.
 */
struct eventfd_ctx  *ki_eventfd;
+
+   /*
+* For polled IO, stash completion info here
+*/
+   long ki_poll_res;
+   long ki_poll_res2;
 };
 
 /*-- sysctl variables*/
@@ -214,6 +240,8 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
+static void aio_reap_polled_events(struct kioctx *);
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
struct file *file;
@@ -732,6 +760,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
INIT_LIST_HEAD(&ctx->active_reqs);
 
+   spin_lock_init(&ctx->poll_lock);
+   INIT_LIST_HEAD(&ctx->poll_pending);
+   INIT_LIST_HEAD(&ctx->poll_done);
+
if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
goto err;
 
@@ -814,6 +846,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx 
*ctx,
RCU_INIT_POINTER(table->table[ctx->id], NULL);
spin_unlock(&mm->ioctx_lock);
 
+   aio_reap_polled_events(ctx);
+
/* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
 
@@ -997,11 +1031,11 @@ static void user_refill_reqs_available(struct kioctx 
*ctx)
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
  */
-static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx, bool 
needs_ring)
 {
struct aio_kiocb *req;
 
-   if (!get_reqs_available(ctx)) {
+   if (needs_ring && !get_reqs_available(ctx)) {
user_refill_reqs_available(ctx);
if (!get_reqs_available(ctx))
return NULL;
@@ -1013,11 +1047,13 @@ static inline struct aio_kiocb *aio_get_req(struct 
kioctx *ctx)
 
percpu_ref_get(&ctx->reqs);
INIT_LIST_HEAD(&req->ki_list);
+   INIT_LIST_HEAD(&req->ki_poll_list);
refcount_set(&req->ki_refcnt, 0);
req->ki_ctx = ctx;
return req;
 out_put:
-   put_reqs_available(ctx, 1);
+   if (needs_ring)
+   put_reqs_available(ctx, 1);
return NULL;
 }
 
@@ -1057,6 +1093,14 @@ static 

[PATCH 5/5] aio: add support for file based polled IO

2018-11-17 Thread Jens Axboe
Needs further work, but this should work fine on normal setups
with a file system on a pollable block device.

Signed-off-by: Jens Axboe 
---
 fs/aio.c   | 2 ++
 fs/direct-io.c | 4 +++-
 fs/iomap.c | 7 +--
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 500da3ffc376..e02085fe10d7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1310,6 +1310,8 @@ static struct block_device *aio_bdev_host(struct kiocb 
*req)
 
if (S_ISBLK(inode->i_mode))
return I_BDEV(inode);
+   else if (inode->i_sb && inode->i_sb->s_bdev)
+   return inode->i_sb->s_bdev;
 
return NULL;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a5a4e5a1423e..34de494e9061 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -477,8 +477,10 @@ static inline void dio_bio_submit(struct dio *dio, struct 
dio_submit *sdio)
if (sdio->submit_io) {
sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio);
dio->bio_cookie = BLK_QC_T_NONE;
-   } else
+   } else {
dio->bio_cookie = submit_bio(bio);
+   WRITE_ONCE(dio->iocb->ki_blk_qc, dio->bio_cookie);
+   }
 
sdio->bio = NULL;
sdio->boundary = 0;
diff --git a/fs/iomap.c b/fs/iomap.c
index 74c1f37f0fd6..4cf412b6230a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1555,6 +1555,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap 
*iomap, loff_t pos,
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
+   blk_qc_t qc;
 
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
@@ -1570,7 +1571,9 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap 
*iomap, loff_t pos,
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
 
atomic_inc(&dio->ref);
-   return submit_bio(bio);
+   qc = submit_bio(bio);
+   WRITE_ONCE(dio->iocb->ki_blk_qc, qc);
+   return qc;
 }
 
 static loff_t
@@ -1680,7 +1683,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
loff_t length,
atomic_inc(&dio->ref);
 
dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-   dio->submit.cookie = submit_bio(bio);
+   dio->iocb->ki_blk_qc = dio->submit.cookie = submit_bio(bio);
} while (nr_pages);
 
if (need_zeroout) {
-- 
2.17.1



[PATCH 2/5] aio: fix failure to put the file pointer

2018-11-17 Thread Jens Axboe
If the ioprio capability check fails, we return without putting
the file pointer.

Fixes: d9a08a9e616b ("fs: Add aio iopriority support")
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/aio.c b/fs/aio.c
index b36691268b6c..3d9bc81cf500 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1436,6 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb 
*iocb)
ret = ioprio_check_cap(iocb->aio_reqprio);
if (ret) {
pr_debug("aio ioprio check cap error: %d\n", ret);
+   fput(req->ki_filp);
return ret;
}
 
-- 
2.17.1



[PATCHSET 0/5] Support for polled aio

2018-11-17 Thread Jens Axboe
Up until now, IO polling has been exclusively available through preadv2
and pwritev2, both fully synchronous interfaces. This works fine for
completely synchronous use cases, but that's about it. If QD=1 wasn't
enough to reach the performance goals, the only alternative was to
increase the thread count. Unfortunately, that isn't very efficient,
both in terms of CPU utilization (each thread will use 100% of CPU
time) and in terms of achievable performance.

With all of the recent advances in polling (non-irq polling, efficiency
gains, multiple pollable queues, etc), it's now feasible to add polling
support to aio - this patchset does just that.

An iocb flag is added, IOCB_FLAG_HIPRI, similarly to how we have
RWF_HIPRI for preadv2/pwritev2. It's applicable to the commands that
read/write data, like IOCB_CMD_PREAD/IOCB_CMD_PWRITE and the vectored
variants.  Submission works the same as before.

The polling happens off io_getevents(), when the application is looking
for completions. That also works like before, with the only difference
being that events aren't waited for, they are actively found and polled
on the device side.

The only real difference in terms of completions is that polling does
NOT use the libaio user exposed ring. This is just not feasible, as the
application needs to be the one that actively polls for the events.
Because of this, that's not supported with polling, and the internals
completely ignore the ring.

Outside of that, it's illegal to mix polled with non-polled IO on the
same io_context. There's no way to set up an io_context with the
information that we will be polling on it (always add flags to new
syscalls...), hence we need to track this internally. For polled IO, we
can never wait for events, we have to actively find them. I didn't want
to add counters to the io_context to inc/dec for each IO, so I just made
this illegal. If an application attempts to submit both polled and
non-polled IO on the same io_context, it will get an -EINVAL return at
io_submit() time.
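
As a rough user-space illustration (assuming a libaio-style setup where
u.c.flags maps onto the kernel's aio_flags, as it does for
IOCB_FLAG_RESFD, and assuming the IOCB_FLAG_HIPRI definition added by
this series is visible; error handling trimmed):

    #include <libaio.h>

    /* Submit one polled read and reap it via io_getevents() */
    static int polled_read(io_context_t ctx, int fd, void *buf, size_t len)
    {
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;

        io_prep_pread(&cb, fd, buf, len, 0);
        cb.u.c.flags |= IOCB_FLAG_HIPRI;    /* mark this iocb as polled */

        if (io_submit(ctx, 1, cbs) != 1)
            return -1;

        /* no sleeping here: the kernel actively polls the device */
        return io_getevents(ctx, 1, 1, &ev, NULL);
    }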

Performance results have been very promising. For an internal Facebook
flash storage device, we're getting a 20% increase in performance, with
an identical reduction in latencies. Notably, that is comparing an
already highly tuned setup against simply turning on polling. I'm sure
there's still extra room for performance there. Note that at these
speeds and feeds, polling ends up NOT using more CPU time than we did
without polling!

On that same box, I ran microbenchmarks, and was able to increase peak
performance by 25%. The box was previously pegged at around 2.4M IOPS;
with polling turned on, bandwidth was maxed out at 12.5GB/sec doing
3.2M IOPS. All of this with 2 million FEWER interrupts/second, and 2M+
fewer context switches.

In terms of efficiency, a tester was able to get 800K+ IOPS out of a
_single_ thread at QD=16 on a device. These kinds of results are simply
unheard of.

You can find this code in my aio-poll branch, and that branch (and
these patches) are on top of my mq-perf branch.

 fs/aio.c | 495 ---
 fs/block_dev.c   |   2 +
 fs/direct-io.c   |   4 +-
 fs/iomap.c   |   7 +-
 include/linux/fs.h   |   1 +
 include/uapi/linux/aio_abi.h |   2 +
 6 files changed, 478 insertions(+), 33 deletions(-)

-- 
Jens Axboe




[PATCH 3/5] aio: add iocb->ki_blk_qc field

2018-11-17 Thread Jens Axboe
Add the field and have the blockdev direct_IO() helpers set it.
This is in preparation for being able to poll for iocb completion.
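
As a hedged illustration of why the cookie is stashed (the helper below
is made up for this sketch, and it assumes the three-argument
blk_poll(q, cookie, spin) from the accompanying block series; the real
consumer is the aio poll code added later):

    /* Poll the device once for the completion of this particular iocb */
    static bool kiocb_poll_once(struct kiocb *kiocb, struct block_device *bdev)
    {
        blk_qc_t qc = READ_ONCE(kiocb->ki_blk_qc);

        return blk_poll(bdev_get_queue(bdev), qc, true);
    }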

Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 2 ++
 include/linux/fs.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d233a59ea364..8a2fed18e3fc 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -236,6 +236,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
bio.bi_opf |= REQ_HIPRI;
 
qc = submit_bio(&bio);
+   WRITE_ONCE(iocb->ki_blk_qc, qc);
for (;;) {
__set_current_state(TASK_UNINTERRUPTIBLE);
 
@@ -396,6 +397,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
bio->bi_opf |= REQ_HIPRI;
 
qc = submit_bio(bio);
+   WRITE_ONCE(iocb->ki_blk_qc, qc);
break;
}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c95c0807471f..032761d9b218 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -310,6 +310,7 @@ struct kiocb {
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
+   u32 ki_blk_qc;
 } __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
-- 
2.17.1



[PATCH 1/5] aio: use assigned completion handler

2018-11-17 Thread Jens Axboe
We know this is a read/write request, but in preparation for
having different kinds of those, ensure that we call the assigned
handler instead of assuming it's aio_complete_rq().

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..b36691268b6c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1484,7 +1484,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t 
ret)
ret = -EINTR;
/*FALLTHRU*/
default:
-   aio_complete_rw(req, ret, 0);
+   req->ki_complete(req, ret, 0);
}
 }
 
-- 
2.17.1



[PATCH 1/7] block: avoid ordered task state change for polled IO

2018-11-17 Thread Jens Axboe
For the core poll helper, the task state setting doesn't need
to imply any atomics, as it's the current task itself that
is being modified and we're not going to sleep.

For IRQ driven IO, the wakeup path has the necessary barriers,
so we don't need the heavy handed version of the task
state setting.
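
A condensed sketch of the wait loop being touched (variable names are
simplified for this sketch, it is not the exact code): the task sets
its own state and re-checks the condition before possibly sleeping, and
in the polled case it never sleeps at all, so the plain store suffices.

    for (;;) {
        __set_current_state(TASK_UNINTERRUPTIBLE);
        if (READ_ONCE(done))        /* completion flag set by the waker */
            break;
        if (!polled || !blk_poll(q, cookie))
            io_schedule();          /* IRQ case: wake_up() orders for us */
    }
    __set_current_state(TASK_RUNNING);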

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c | 4 ++--
 fs/block_dev.c | 7 +--
 fs/iomap.c | 3 ++-
 mm/page_io.c   | 3 ++-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32b246ed44c0..7fc4abb4cc36 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3331,12 +3331,12 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
ret = q->mq_ops->poll(hctx, rq->tag);
if (ret > 0) {
hctx->poll_success++;
-   set_current_state(TASK_RUNNING);
+   __set_current_state(TASK_RUNNING);
return true;
}
 
if (signal_pending_state(state, current))
-   set_current_state(TASK_RUNNING);
+   __set_current_state(TASK_RUNNING);
 
if (current->state == TASK_RUNNING)
return true;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4d79bc80fb41..64ba27b8b754 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -237,9 +237,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
 
qc = submit_bio(&bio);
for (;;) {
-   set_current_state(TASK_UNINTERRUPTIBLE);
+   __set_current_state(TASK_UNINTERRUPTIBLE);
+
if (!READ_ONCE(bio.bi_private))
break;
+
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc))
io_schedule();
@@ -415,7 +417,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
return -EIOCBQUEUED;
 
for (;;) {
-   set_current_state(TASK_UNINTERRUPTIBLE);
+   __set_current_state(TASK_UNINTERRUPTIBLE);
+
if (!READ_ONCE(dio->waiter))
break;
 
diff --git a/fs/iomap.c b/fs/iomap.c
index b0462b363bad..c5df035ace6f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1888,7 +1888,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
return -EIOCBQUEUED;
 
for (;;) {
-   set_current_state(TASK_UNINTERRUPTIBLE);
+   __set_current_state(TASK_UNINTERRUPTIBLE);
+
if (!READ_ONCE(dio->submit.waiter))
break;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 57572ff46016..a7271fa481f6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -405,7 +405,8 @@ int swap_readpage(struct page *page, bool synchronous)
bio_get(bio);
qc = submit_bio(bio);
while (synchronous) {
-   set_current_state(TASK_UNINTERRUPTIBLE);
+   __set_current_state(TASK_UNINTERRUPTIBLE);
+
if (!READ_ONCE(bio->bi_private))
break;
 
-- 
2.17.1



[PATCH 2/7] block: have ->poll_fn() return number of entries polled

2018-11-17 Thread Jens Axboe
We currently only really support sync poll, ie poll with 1
IO in flight. This prepares us for supporting async poll.

Note that the returned value isn't necessarily 100% accurate.
If poll races with IRQ completion, we assume that the fact
that the task is now runnable means we found at least one
entry. In reality it could be more than 1, or not even 1.
This is fine, the caller will just need to take this into
account.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c| 18 +-
 drivers/nvme/host/multipath.c |  4 ++--
 include/linux/blkdev.h|  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7fc4abb4cc36..52b1c97cd7c6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,7 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -3305,7 +3305,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue 
*q,
return true;
 }
 
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
struct request_queue *q = hctx->queue;
long state;
@@ -3318,7 +3318,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
 * straight to the busy poll loop.
 */
if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
-   return true;
+   return 1;
 
hctx->poll_considered++;
 
@@ -3332,30 +3332,30 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
if (ret > 0) {
hctx->poll_success++;
__set_current_state(TASK_RUNNING);
-   return true;
+   return ret;
}
 
if (signal_pending_state(state, current))
__set_current_state(TASK_RUNNING);
 
if (current->state == TASK_RUNNING)
-   return true;
+   return 1;
if (ret < 0)
break;
cpu_relax();
}
 
__set_current_state(TASK_RUNNING);
-   return false;
+   return 0;
 }
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 {
struct blk_mq_hw_ctx *hctx;
struct request *rq;
 
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-   return false;
+   return 0;
 
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
if (!blk_qc_t_is_internal(cookie))
@@ -3369,7 +3369,7 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t 
cookie)
 * so we should be safe with just the NULL check.
 */
if (!rq)
-   return false;
+   return 0;
}
 
return __blk_mq_poll(hctx, rq);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index b82b0d3ca39a..65539c8df11d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,11 +220,11 @@ static blk_qc_t nvme_ns_head_make_request(struct 
request_queue *q,
return ret;
 }
 
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
 {
struct nvme_ns_head *head = q->queuedata;
struct nvme_ns *ns;
-   bool found = false;
+   int found = 0;
int srcu_idx;
 
srcu_idx = srcu_read_lock(&head->srcu);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ad6eafc43f2..e97c0a3b2262 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request 
*req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
-- 
2.17.1



[PATCH 5/7] blk-mq: remove 'tag' parameter from mq_ops->poll()

2018-11-17 Thread Jens Axboe
We always pass in -1 now and none of the callers use the tag value,
remove the parameter.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c   | 2 +-
 drivers/nvme/host/pci.c  | 8 
 drivers/nvme/host/rdma.c | 2 +-
 include/linux/blk-mq.h   | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ca00d712158..10a14baa04ff 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3341,7 +3341,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx)
 
hctx->poll_invoked++;
 
-   ret = q->mq_ops->poll(hctx, -1U);
+   ret = q->mq_ops->poll(hctx);
if (ret > 0) {
hctx->poll_success++;
__set_current_state(TASK_RUNNING);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1742c8ab8196..1280cc327096 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1075,14 +1075,14 @@ static int __nvme_poll(struct nvme_queue *nvmeq, 
unsigned int tag)
return found;
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx)
 {
struct nvme_queue *nvmeq = hctx->driver_data;
 
-   return __nvme_poll(nvmeq, tag);
+   return __nvme_poll(nvmeq, -1);
 }
 
-static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx)
 {
struct nvme_queue *nvmeq = hctx->driver_data;
u16 start, end;
@@ -1092,7 +1092,7 @@ static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, 
unsigned int tag)
return 0;
 
spin_lock(&nvmeq->cq_lock);
-   found = nvme_process_cq(nvmeq, &start, &end, tag);
+   found = nvme_process_cq(nvmeq, &start, &end, -1);
spin_unlock(&nvmeq->cq_lock);
 
nvme_complete_cqes(nvmeq, start, end);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 53e44efc6d32..4b2ba563acdb 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1741,7 +1741,7 @@ static blk_status_t nvme_rdma_queue_rq(struct 
blk_mq_hw_ctx *hctx,
return BLK_STS_IOERR;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
 {
struct nvme_rdma_queue *queue = hctx->driver_data;
struct ib_cq *cq = queue->ib_cq;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 929e8abc5535..ca0520ca6437 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -132,7 +132,7 @@ typedef void (exit_request_fn)(struct blk_mq_tag_set *set, 
struct request *,
 typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (poll_fn)(struct blk_mq_hw_ctx *);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
 typedef void (complete_fn)(struct request *);
-- 
2.17.1



[PATCH 6/7] block: make blk_poll() take a parameter on whether to spin or not

2018-11-17 Thread Jens Axboe
blk_poll() has always kept spinning until it found an IO. This is
fine for SYNC polling, since we need to find one request we have
pending, but in preparation for ASYNC polling it can be beneficial
to just check if we have any entries available or not.

Existing callers are converted to pass in 'spin == true', to retain
the old behavior.
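
For illustration, the intended difference between the two modes (the
surrounding variables here are made up, only the blk_poll() calls are
meant to be real):

    /* Sync O_DIRECT style: one known IO outstanding, spin until found */
    while (!READ_ONCE(dio_done))
        blk_poll(q, cookie, true);

    /* Async reap style (e.g. from io_getevents()): check once, move on */
    if (blk_poll(q, cookie, false))
        found++;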

Signed-off-by: Jens Axboe 
---
 block/blk-core.c  |  4 ++--
 block/blk-mq.c| 10 +-
 drivers/nvme/host/multipath.c |  4 ++--
 drivers/nvme/target/io-cmd-bdev.c |  2 +-
 fs/block_dev.c|  4 ++--
 fs/direct-io.c|  2 +-
 fs/iomap.c|  2 +-
 include/linux/blkdev.h|  4 ++--
 mm/page_io.c  |  2 +-
 9 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0b684a520a11..ccf40f853afd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1284,14 +1284,14 @@ blk_qc_t submit_bio(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
+bool blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
if (!q->poll_fn || !blk_qc_t_valid(cookie))
return false;
 
if (current->plug)
blk_flush_plug_list(current->plug, false);
-   return q->poll_fn(q, cookie);
+   return q->poll_fn(q, cookie, spin);
 }
 EXPORT_SYMBOL_GPL(blk_poll);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 10a14baa04ff..0a2847d9248b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,7 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -3328,7 +3328,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
return blk_mq_poll_hybrid_sleep(q, hctx, rq);
 }
 
-static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, bool spin)
 {
struct request_queue *q = hctx->queue;
long state;
@@ -3353,7 +3353,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx)
 
if (current->state == TASK_RUNNING)
return 1;
-   if (ret < 0)
+   if (ret < 0 || !spin)
break;
cpu_relax();
}
@@ -3362,7 +3362,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx)
return 0;
 }
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
struct blk_mq_hw_ctx *hctx;
 
@@ -3381,7 +3381,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t 
cookie)
if (blk_mq_poll_hybrid(q, hctx, cookie))
return 1;
 
-   return __blk_mq_poll(hctx);
+   return __blk_mq_poll(hctx, spin);
 }
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 65539c8df11d..c83bb3302684 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,7 +220,7 @@ static blk_qc_t nvme_ns_head_make_request(struct 
request_queue *q,
return ret;
 }
 
-static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin)
 {
struct nvme_ns_head *head = q->queuedata;
struct nvme_ns *ns;
@@ -230,7 +230,7 @@ static int nvme_ns_head_poll(struct request_queue *q, 
blk_qc_t qc)
srcu_idx = srcu_read_lock(&head->srcu);
ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
if (likely(ns && nvme_path_is_optimized(ns)))
-   found = ns->queue->poll_fn(q, qc);
+   found = ns->queue->poll_fn(q, qc, spin);
srcu_read_unlock(&head->srcu, srcu_idx);
return found;
 }
diff --git a/drivers/nvme/target/io-cmd-bdev.c 
b/drivers/nvme/target/io-cmd-bdev.c
index c1ec3475a140..f6971b45bc54 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -116,7 +116,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 
cookie = submit_bio(bio);
 
-   blk_poll(bdev_get_queue(req->ns->bdev), cookie);
+   blk_poll(bdev_get_queue(req->ns->bdev), cookie, true);
 }
 
 static void nvmet_bdev_execute_flush(struct nvmet_req *req)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 64ba27b8b754..d233a59ea364 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -243,7 +243,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
break;
 
if (!(iocb->ki_flags & IOCB_HIPRI) ||
-   !blk_poll(bdev_get_queue(bdev), qc))
+   

[PATCH 3/7] nvme-fc: remove unused poll implementation

2018-11-17 Thread Jens Axboe
This relies on the fc target ops setting ->poll_queue, which
nobody does. Otherwise it just checks if something has
completed, which isn't very useful.

Signed-off-by: Jens Axboe 
---
 drivers/nvme/host/fc.c | 33 -
 1 file changed, 33 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 98c3c77f48f6..de797c641265 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2302,38 +2302,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
 }
 
-static struct blk_mq_tags *
-nvme_fc_tagset(struct nvme_fc_queue *queue)
-{
-   if (queue->qnum == 0)
-   return queue->ctrl->admin_tag_set.tags[queue->qnum];
-
-   return queue->ctrl->tag_set.tags[queue->qnum - 1];
-}
-
-static int
-nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-
-{
-   struct nvme_fc_queue *queue = hctx->driver_data;
-   struct nvme_fc_ctrl *ctrl = queue->ctrl;
-   struct request *req;
-   struct nvme_fc_fcp_op *op;
-
-   req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
-   if (!req)
-   return 0;
-
-   op = blk_mq_rq_to_pdu(req);
-
-   if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
-(ctrl->lport->ops->poll_queue))
-   ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
-queue->lldd_handle);
-
-   return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
-}
-
 static void
 nvme_fc_submit_async_event(struct nvme_ctrl *arg)
 {
@@ -2404,7 +2372,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
.init_request   = nvme_fc_init_request,
.exit_request   = nvme_fc_exit_request,
.init_hctx  = nvme_fc_init_hctx,
-   .poll   = nvme_fc_poll,
.timeout= nvme_fc_timeout,
 };
 
-- 
2.17.1



[PATCH 7/7] blk-mq: ensure mq_ops ->poll() is entered at least once

2018-11-17 Thread Jens Axboe
Right now we immediately bail if need_resched() is true, but
we need to do at least one loop in case we have entries waiting.
So just invert the need_resched() check, putting it at the
bottom of the loop.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0a2847d9248b..4769c975b8c8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3336,7 +3336,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, bool 
spin)
hctx->poll_considered++;
 
state = current->state;
-   while (!need_resched()) {
+   do {
int ret;
 
hctx->poll_invoked++;
@@ -3356,7 +3356,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, bool 
spin)
if (ret < 0 || !spin)
break;
cpu_relax();
-   }
+   } while (!need_resched());
 
__set_current_state(TASK_RUNNING);
return 0;
-- 
2.17.1



[PATCH 4/7] blk-mq: when polling for IO, look for any completion

2018-11-17 Thread Jens Axboe
If we want to support async IO polling, then we have to allow
finding completions that aren't just for the one we are
looking for. Always pass in -1 to the mq_ops->poll() helper,
and have that return how many events were found in this poll
loop.

Signed-off-by: Jens Axboe 
---
 block/blk-mq.c   | 69 +++-
 drivers/nvme/host/pci.c  | 14 
 drivers/nvme/host/rdma.c | 36 ++---
 3 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 52b1c97cd7c6..3ca00d712158 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3266,9 +3266,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue 
*q,
 *  0:  use half of prev avg
 * >0:  use this specific value
 */
-   if (q->poll_nsec == -1)
-   return false;
-   else if (q->poll_nsec > 0)
+   if (q->poll_nsec > 0)
nsecs = q->poll_nsec;
else
nsecs = blk_mq_poll_nsecs(q, hctx, rq);
@@ -3305,21 +3303,36 @@ static bool blk_mq_poll_hybrid_sleep(struct 
request_queue *q,
return true;
 }
 
-static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q,
+  struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
+{
+   struct request *rq;
+
+   if (q->poll_nsec == -1)
+   return false;
+
+   if (!blk_qc_t_is_internal(cookie))
+   rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+   else {
+   rq = blk_mq_tag_to_rq(hctx->sched_tags, 
blk_qc_t_to_tag(cookie));
+   /*
+* With scheduling, if the request has completed, we'll
+* get a NULL return here, as we clear the sched tag when
+* that happens. The request still remains valid, like always,
+* so we should be safe with just the NULL check.
+*/
+   if (!rq)
+   return false;
+   }
+
+   return blk_mq_poll_hybrid_sleep(q, hctx, rq);
+}
+
+static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
long state;
 
-   /*
-* If we sleep, have the caller restart the poll loop to reset
-* the state. Like for the other success return cases, the
-* caller is responsible for checking if the IO completed. If
-* the IO isn't complete, we'll get called again and will go
-* straight to the busy poll loop.
-*/
-   if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
-   return 1;
-
hctx->poll_considered++;
 
state = current->state;
@@ -3328,7 +3341,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
 
hctx->poll_invoked++;
 
-   ret = q->mq_ops->poll(hctx, rq->tag);
+   ret = q->mq_ops->poll(hctx, -1U);
if (ret > 0) {
hctx->poll_success++;
__set_current_state(TASK_RUNNING);
@@ -3352,27 +3365,23 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
 static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 {
struct blk_mq_hw_ctx *hctx;
-   struct request *rq;
 
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
 
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-   if (!blk_qc_t_is_internal(cookie))
-   rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-   else {
-   rq = blk_mq_tag_to_rq(hctx->sched_tags, 
blk_qc_t_to_tag(cookie));
-   /*
-* With scheduling, if the request has completed, we'll
-* get a NULL return here, as we clear the sched tag when
-* that happens. The request still remains valid, like always,
-* so we should be safe with just the NULL check.
-*/
-   if (!rq)
-   return 0;
-   }
 
-   return __blk_mq_poll(hctx, rq);
+   /*
+* If we sleep, have the caller restart the poll loop to reset
+* the state. Like for the other success return cases, the
+* caller is responsible for checking if the IO completed. If
+* the IO isn't complete, we'll get called again and will go
+* straight to the busy poll loop.
+*/
+   if (blk_mq_poll_hybrid(q, hctx, cookie))
+   return 1;
+
+   return __blk_mq_poll(hctx);
 }
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 89874e23e422..1742c8ab8196 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1012,15 +1012,15 @@ static inline void nvme_update_cq_head(struct 
nvme_queue *nvmeq)
}
 }
 
-static inline bool nvme_process_cq(struct nvme_queue *nvmeq, 

[PATCHSET v4] Various block optimizations

2018-11-17 Thread Jens Axboe
Some of these are optimizations, the latter part is prep work
for supporting polling with aio.

Patches against my for-4.21/block branch. These patches can also
be found in my mq-perf branch. Some of them are prep patches for
the aio poll work, which can be found in my aio-poll branch.
These will get posted soon.

Changes since v3:

- Fix tiny race in iocb flag checking for plugging
- Improve poll-many implementation, including nvme-rdma
- Get rid of unneeded smp_rmb()
- Drop merged patches

Changes since v2:

- Include polled swap IO in the poll optimizations
- Get rid of unnecessary write barrier for DIO wakeup
- Fix a potential stall if need_resched() was set and preempt
  wasn't enabled
- Provide separate mq_ops for NVMe with poll queues
- Drop q->mq_ops patch
- Rebase on top of for-4.21/block

Changes since v1:

- Improve nvme irq disabling for polled IO
- Fix barriers in the ordered wakeup for polled O_DIRECT
- Add patch to allow polling to find any command that is done
- Add patch to control whether polling spins or not
- Have async O_DIRECT mark a bio as pollable
- Don't plug for polling


 block/blk-core.c  |  4 +-
 block/blk-mq.c| 91 +--
 drivers/nvme/host/fc.c| 33 --
 drivers/nvme/host/multipath.c |  6 +--
 drivers/nvme/host/pci.c   | 22 +-
 drivers/nvme/host/rdma.c  | 38 
 drivers/nvme/target/io-cmd-bdev.c |  2 +-
 fs/block_dev.c| 11 +++--
 fs/direct-io.c|  2 +-
 fs/iomap.c|  5 ++-
 include/linux/blk-mq.h|  2 +-
 include/linux/blkdev.h|  4 +-
 mm/page_io.c  |  5 ++-
 13 files changed, 101 insertions(+), 124 deletions(-)

-- 
Jens Axboe




[RFC] block: add io timeout to sysfs

2018-11-17 Thread Weiping Zhang
Give an interface to adjust the io timeout per device.
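
For illustration, adjusting the attribute from user space could look
like this (device name and value are examples only; note that the store
routine below hands the value straight to blk_queue_rq_timeout()):

    #include <stdio.h>

    int main(void)
    {
        /* hypothetical device; substitute a real one */
        FILE *f = fopen("/sys/block/nvme0n1/queue/io_timeout", "w");

        if (!f)
            return 1;
        fprintf(f, "%u\n", 30000);
        return fclose(f) ? 1 : 0;
    }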

Signed-off-by: Weiping Zhang 
---
 block/blk-sysfs.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..0cabfb935e71 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, 
const char *page,
return ret;
 }
 
+static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+{
+   return sprintf(page, "%u\n", q->rq_timeout);
+}
+
+static ssize_t queue_io_timeout_store(struct request_queue *q, const char 
*page,
+ size_t count)
+{
+   unsigned int val;
+   int err;
+
+   err = kstrtou32(page, 10, &val);
+   if (err)
+   return -EINVAL;
+
+   blk_queue_rq_timeout(q, val);
+
+   return count;
+}
+
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
if (!wbt_rq_qos(q))
@@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_io_timeout_entry = {
+   .attr = {.name = "io_timeout", .mode = 0644 },
+   .show = queue_io_timeout_show,
+   .store = queue_io_timeout_store,
+};
+
 static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = 0644 },
.show = queue_wb_lat_show,
@@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
&queue_dax_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
+   &queue_io_timeout_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&throtl_sample_time_entry.attr,
 #endif
-- 
2.14.1



Re: [PATCH 6/6] mmc: stop abusing the request queue_lock pointer

2018-11-17 Thread Ulf Hansson
On 16 November 2018 at 09:10, Christoph Hellwig  wrote:
> Replace the lock in mmc_blk_data that is only used through a pointer
> in struct mmc_queue and to protect fields in that structure with
> an actual lock in struct mmc_queue.
>
> Suggested-by: Ulf Hansson 
> Signed-off-by: Christoph Hellwig 

Looks good, thanks!

Reviewed-by: Ulf Hansson 

Kind regards
Uffe

> ---
>  drivers/mmc/core/block.c | 24 +++-
>  drivers/mmc/core/queue.c | 31 +++
>  drivers/mmc/core/queue.h |  4 ++--
>  3 files changed, 28 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
> index 70ec465beb69..2c329a3e3fdb 100644
> --- a/drivers/mmc/core/block.c
> +++ b/drivers/mmc/core/block.c
> @@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida);
>   * There is one mmc_blk_data per slot.
>   */
>  struct mmc_blk_data {
> -   spinlock_t  lock;
> struct device   *parent;
> struct gendisk  *disk;
> struct mmc_queue queue;
> @@ -1483,7 +1482,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue 
> *mq, struct request *req)
> blk_mq_end_request(req, BLK_STS_OK);
> }
>
> -   spin_lock_irqsave(mq->lock, flags);
> +   spin_lock_irqsave(&mq->lock, flags);
>
> mq->in_flight[mmc_issue_type(mq, req)] -= 1;
>
> @@ -1491,7 +1490,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue 
> *mq, struct request *req)
>
> mmc_cqe_check_busy(mq);
>
> -   spin_unlock_irqrestore(mq->lock, flags);
> +   spin_unlock_irqrestore(&mq->lock, flags);
>
> if (!mq->cqe_busy)
> blk_mq_run_hw_queues(q, true);
> @@ -1991,13 +1990,13 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue 
> *mq, struct request *req)
> unsigned long flags;
> bool put_card;
>
> -   spin_lock_irqsave(mq->lock, flags);
> +   spin_lock_irqsave(&mq->lock, flags);
>
> mq->in_flight[mmc_issue_type(mq, req)] -= 1;
>
> put_card = (mmc_tot_in_flight(mq) == 0);
>
> -   spin_unlock_irqrestore(mq->lock, flags);
> +   spin_unlock_irqrestore(&mq->lock, flags);
>
> if (put_card)
> mmc_put_card(mq->card, >ctx);
> @@ -2093,11 +2092,11 @@ static void mmc_blk_mq_req_done(struct mmc_request 
> *mrq)
>  * request does not need to wait (although it does need to
>  * complete complete_req first).
>  */
> -   spin_lock_irqsave(mq->lock, flags);
> +   spin_lock_irqsave(&mq->lock, flags);
> mq->complete_req = req;
> mq->rw_wait = false;
> waiting = mq->waiting;
> -   spin_unlock_irqrestore(mq->lock, flags);
> +   spin_unlock_irqrestore(&mq->lock, flags);
>
> /*
>  * If 'waiting' then the waiting task will complete this
> @@ -2116,10 +2115,10 @@ static void mmc_blk_mq_req_done(struct mmc_request 
> *mrq)
> /* Take the recovery path for errors or urgent background operations 
> */
> if (mmc_blk_rq_error(>brq) ||
> mmc_blk_urgent_bkops_needed(mq, mqrq)) {
> -   spin_lock_irqsave(mq->lock, flags);
> +   spin_lock_irqsave(&mq->lock, flags);
> mq->recovery_needed = true;
> mq->recovery_req = req;
> -   spin_unlock_irqrestore(mq->lock, flags);
> +   spin_unlock_irqrestore(&mq->lock, flags);
> wake_up(&mq->wait);
> schedule_work(&mq->recovery_work);
> return;
> @@ -2142,7 +2141,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, 
> int *err)
>  * Wait while there is another request in progress, but not if 
> recovery
>  * is needed. Also indicate whether there is a request waiting to 
> start.
>  */
> -   spin_lock_irqsave(mq->lock, flags);
> +   spin_lock_irqsave(&mq->lock, flags);
> if (mq->recovery_needed) {
> *err = -EBUSY;
> done = true;
> @@ -2150,7 +2149,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, 
> int *err)
> done = !mq->rw_wait;
> }
> mq->waiting = !done;
> -   spin_unlock_irqrestore(mq->lock, flags);
> +   spin_unlock_irqrestore(&mq->lock, flags);
>
> return done;
>  }
> @@ -2327,12 +2326,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct 
> mmc_card *card,
> goto err_kfree;
> }
>
> -   spin_lock_init(&md->lock);
> INIT_LIST_HEAD(&md->part);
> INIT_LIST_HEAD(&md->rpmbs);
> md->usage = 1;
>
> -   ret = mmc_init_queue(&md->queue, card, &md->lock);
> +   ret = mmc_init_queue(&md->queue, card);
> if (ret)
> goto err_putdisk;
>
> diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
> index 4485cf12218c..35cc138b096d 100644
> --- a/drivers/mmc/core/queue.c
> +++ b/drivers/mmc/core/queue.c
> @@ -89,9 +89,9 @@ void 

Re: [PATCH V2 for-4.21 2/2] blk-mq: alloc q->queue_ctx as normal array

2018-11-17 Thread Greg Kroah-Hartman
On Sat, Nov 17, 2018 at 10:34:18AM +0800, Ming Lei wrote:
> On Fri, Nov 16, 2018 at 06:06:23AM -0800, Greg Kroah-Hartman wrote:
> > On Fri, Nov 16, 2018 at 07:23:11PM +0800, Ming Lei wrote:
> > > Now q->queue_ctx is just one read-mostly table for query the
> > > 'blk_mq_ctx' instance from one cpu index, it isn't necessary
> > > to allocate it as percpu variable. One simple array may be
> > > more efficient.
> > 
> > "may be", have you run benchmarks to be sure?  If so, can you add the
> > results of them to this changelog?  If there is no measurable
> > difference, then why make this change at all?
> 
> __blk_mq_get_ctx() is used in fast path, what do you think about which
> one is more efficient?
> 
> - *per_cpu_ptr(q->queue_ctx, cpu);
> 
> - q->queue_ctx[cpu]

You need to actually test to see which one is faster, you might be
surprised :)

In other words, do not just guess.

> At least the latter isn't worse than the former.

How do you know?

> Especially q->queue_ctx is just a read-only look-up table, it doesn't
> make sense to make it percpu any more.
> 
> Not mention q->queue_ctx[cpu] is more clean/readable.

Again, please test to verify this.

thanks,

greg k-h