[PATCH 21/26] block: add BIO_HOLD_PAGES flag

2018-12-07 Thread Jens Axboe
For user mapped IO, we do get_user_pages() upfront, and then do a
put_page() on each page at end_io time to release the page reference. In
preparation for having permanently mapped pages, add a BIO_HOLD_PAGES
flag that tells us not to release the pages; the caller will do that.
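
As a rough sketch of the intended usage (not code from this series): a
caller that keeps its pages pinned for the long term would build bios
along these lines. The helper name and setup details are made up; only
the BIO_HOLD_PAGES handling mirrors this patch.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hedged sketch: issue a read against a page the caller keeps pinned. */
static void issue_prepinned_read(struct block_device *bdev, sector_t sector,
                                 struct page *page, unsigned int len,
                                 bio_end_io_t *end_io, void *priv)
{
    struct bio *bio = bio_alloc(GFP_KERNEL, 1);

    bio_set_dev(bio, bdev);
    bio->bi_iter.bi_sector = sector;
    bio->bi_opf = REQ_OP_READ;
    bio->bi_end_io = end_io;
    bio->bi_private = priv;
    bio_add_page(bio, page, len, 0);

    /* the caller owns the page reference; completion must not drop it */
    bio_set_flag(bio, BIO_HOLD_PAGES);
    submit_bio(bio);
}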

Signed-off-by: Jens Axboe 
---
 block/bio.c   | 6 --
 include/linux/blk_types.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 06760543ec81..3e45e5650265 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1636,7 +1636,8 @@ static void bio_dirty_fn(struct work_struct *work)
next = bio->bi_private;
 
bio_set_pages_dirty(bio);
-   bio_release_pages(bio);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_release_pages(bio);
bio_put(bio);
}
 }
@@ -1652,7 +1653,8 @@ void bio_check_pages_dirty(struct bio *bio)
goto defer;
}
 
-   bio_release_pages(bio);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_release_pages(bio);
bio_put(bio);
return;
 defer:
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 921d734d6b5d..356a4c89b0d9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -228,6 +228,7 @@ struct bio {
 #define BIO_TRACE_COMPLETION 10/* bio_endio() should trace the final 
completion
 * of this bio. */
 #define BIO_QUEUE_ENTERED 11   /* can use blk_queue_enter_live() */
+#define BIO_HOLD_PAGES 12  /* don't put O_DIRECT pages */
 
 /* See BVEC_POOL_OFFSET below before adding new flags */
 
-- 
2.17.1



[PATCH 20/26] aio: batch aio_kiocb allocation

2018-12-07 Thread Jens Axboe
Similarly to how we use the state->ios_left to know how many references
to get to a file, we can use it to allocate the aio_kiocb's we need in
bulk.
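
For reference, the bulk-allocation pattern this boils down to looks
roughly like the following. The structure below is an illustrative
stand-in, not the exact aio_submit_state layout; the batch size mirrors
AIO_IOPOLL_BATCH from the patch.

#include <linux/kernel.h>
#include <linux/slab.h>

#define IOCB_BATCH  8

struct iocb_cache {
    void *objs[IOCB_BATCH];
    unsigned int free, cur;
};

/* Pull one object from the cache, refilling it with a single bulk call. */
static void *cache_alloc(struct kmem_cache *cachep, struct iocb_cache *c,
                         unsigned int want)
{
    if (!c->free) {
        size_t nr = min_t(size_t, want, IOCB_BATCH);

        nr = kmem_cache_alloc_bulk(cachep, GFP_KERNEL, nr, c->objs);
        if (!nr)
            return NULL;
        c->free = nr;
        c->cur = 0;
    }
    c->free--;
    return c->objs[c->cur++];
}

/* Hand back whatever is left when the submission batch ends. */
static void cache_drain(struct kmem_cache *cachep, struct iocb_cache *c)
{
    if (c->free)
        kmem_cache_free_bulk(cachep, c->free, &c->objs[c->cur]);
    c->free = 0;
}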

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 47 +++
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 51c7159f09bf..52a0b2291f71 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -240,6 +240,8 @@ struct aio_kiocb {
};
 };
 
+#define AIO_IOPOLL_BATCH   8
+
 struct aio_submit_state {
struct kioctx *ctx;
 
@@ -254,6 +256,13 @@ struct aio_submit_state {
struct list_head req_list;
unsigned int req_count;
 
+   /*
+* aio_kiocb alloc cache
+*/
+   void *iocbs[AIO_IOPOLL_BATCH];
+   unsigned int free_iocbs;
+   unsigned int cur_iocb;
+
/*
 * File reference cache
 */
@@ -1113,15 +1122,35 @@ static void aio_iocb_init(struct kioctx *ctx, struct 
aio_kiocb *req)
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
  */
-static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
+static struct aio_kiocb *aio_get_req(struct kioctx *ctx,
+struct aio_submit_state *state)
 {
struct aio_kiocb *req;
 
-   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
-   if (unlikely(!req))
-   return NULL;
+   if (!state)
+   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+   else if (!state->free_iocbs) {
+   size_t size;
+
+   size = min_t(size_t, state->ios_left, ARRAY_SIZE(state->iocbs));
+   size = kmem_cache_alloc_bulk(kiocb_cachep, GFP_KERNEL, size,
+   state->iocbs);
+   if (size < 0)
+   return ERR_PTR(size);
+   else if (!size)
+   return ERR_PTR(-ENOMEM);
+   state->free_iocbs = size - 1;
+   state->cur_iocb = 1;
+   req = state->iocbs[0];
+   } else {
+   req = state->iocbs[state->cur_iocb];
+   state->free_iocbs--;
+   state->cur_iocb++;
+   }
+
+   if (req)
+   aio_iocb_init(ctx, req);
 
-   aio_iocb_init(ctx, req);
return req;
 }
 
@@ -1359,8 +1388,6 @@ static bool aio_read_events(struct kioctx *ctx, long 
min_nr, long nr,
return ret < 0 || *i >= min_nr;
 }
 
-#define AIO_IOPOLL_BATCH   8
-
 /*
  * Process completed iocb iopoll entries, copying the result to userspace.
  */
@@ -2360,7 +2387,7 @@ static int __io_submit_one(struct kioctx *ctx, const 
struct iocb *iocb,
return -EAGAIN;
 
ret = -EAGAIN;
-   req = aio_get_req(ctx);
+   req = aio_get_req(ctx, state);
if (unlikely(!req))
goto out_put_reqs_available;
 
@@ -2492,6 +2519,9 @@ static void aio_submit_state_end(struct aio_submit_state 
*state)
if (!list_empty(&state->req_list))
aio_flush_state_reqs(state->ctx, state);
aio_file_put(state);
+   if (state->free_iocbs)
+   kmem_cache_free_bulk(kiocb_cachep, state->free_iocbs,
+   &state->iocbs[state->cur_iocb]);
 }
 
 /*
@@ -2503,6 +2533,7 @@ static void aio_submit_state_start(struct 
aio_submit_state *state,
state->ctx = ctx;
INIT_LIST_HEAD(&state->req_list);
state->req_count = 0;
+   state->free_iocbs = 0;
state->file = NULL;
state->ios_left = max_ios;
 #ifdef CONFIG_BLOCK
-- 
2.17.1



[PATCH 26/26] aio: add support for submission/completion rings

2018-12-07 Thread Jens Axboe
Experimental support for submitting and completing IO through rings
shared between the application and kernel.

The submission rings are struct iocb, like we would submit through
io_submit(), and the completion rings are struct io_event, like we
would pass in (and copy back) from io_getevents().

A new system call is added for this, io_ring_enter(). This system
call submits IO that is queued in the SQ ring, and/or completes IO
and stores the results in the CQ ring.

This could be augmented with a kernel thread that does the submission
and polling, in which case the application would never have to enter
the kernel to do IO.

Sample application: http://brick.kernel.dk/snaps/aio-ring.c
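
The ring ABI itself is not shown in this excerpt, but the consumption
model is the familiar single-producer/single-consumer ring: the kernel
advances the CQ tail, the application advances the head. A generic
userspace sketch of that pattern follows; the layout and field names are
placeholders rather than the actual ABI, and a real implementation needs
acquire/release barriers around the index loads and stores.

#include <stdint.h>
#include <linux/aio_abi.h>

/* Placeholder CQ ring layout; the real one comes from the full patch. */
struct cq_ring {
    volatile uint32_t head;      /* written by the application */
    volatile uint32_t tail;      /* written by the kernel */
    uint32_t ring_mask;          /* nr_entries is a power of two */
    struct io_event *events;
};

/* Drain completed events; returns how many were consumed. */
static unsigned int reap_cq(struct cq_ring *cq,
                            void (*handle)(const struct io_event *))
{
    uint32_t head = cq->head;
    unsigned int seen = 0;

    while (head != cq->tail) {
        handle(&cq->events[head & cq->ring_mask]);
        head++;
        seen++;
    }
    cq->head = head;    /* publish so the kernel can reuse the slots */
    return seen;
}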

Signed-off-by: Jens Axboe 
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/aio.c   | 435 +++--
 include/linux/syscalls.h   |   4 +-
 include/uapi/linux/aio_abi.h   |  26 ++
 kernel/sys_ni.c|   1 +
 5 files changed, 433 insertions(+), 34 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 67c357225fb0..55a26700a637 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
 333common  io_pgetevents   __x64_sys_io_pgetevents
 334common  rseq__x64_sys_rseq
 335common  io_setup2   __x64_sys_io_setup2
+336common  io_ring_enter   __x64_sys_io_ring_enter
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index de48faeab0fd..b00c9fb9fa35 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -142,6 +142,11 @@ struct kioctx {
 
struct aio_mapped_range iocb_range;
 
+   /* if used, completion and submission rings */
+   struct aio_mapped_range sq_ring;
+   struct aio_mapped_range cq_ring;
+   int cq_ring_overflow;
+
struct rcu_work free_rwork; /* see free_ioctx() */
 
/*
@@ -297,6 +302,8 @@ static const struct address_space_operations aio_ctx_aops;
 
 static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
+static const unsigned int event_page_shift =
+   ilog2(PAGE_SIZE / sizeof(struct io_event));
 
 /*
  * We rely on block level unplugs to flush pending requests, if we schedule
@@ -307,6 +314,7 @@ static const bool aio_use_state_req_list = true;
 static const bool aio_use_state_req_list = false;
 #endif
 
+static void aio_scqring_unmap(struct kioctx *);
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
 static void aio_iocb_buffer_unmap(struct kioctx *);
@@ -539,6 +547,12 @@ static const struct address_space_operations aio_ctx_aops 
= {
 #endif
 };
 
+/* Polled IO or SQ/CQ rings don't use the old ring */
+static bool aio_ctx_old_ring(struct kioctx *ctx)
+{
+   return !(ctx->flags & (IOCTX_FLAG_IOPOLL | IOCTX_FLAG_SCQRING));
+}
+
 static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 {
struct aio_ring *ring;
@@ -553,7 +567,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int 
nr_events)
 * IO polling doesn't require any io event entries
 */
size = sizeof(struct aio_ring);
-   if (!(ctx->flags & IOCTX_FLAG_IOPOLL)) {
+   if (aio_ctx_old_ring(ctx)) {
nr_events += 2; /* 1 is required, 2 for good luck */
size += sizeof(struct io_event) * nr_events;
}
@@ -640,6 +654,17 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int 
nr_events)
return 0;
 }
 
+/*
+ * Don't support cancel on anything that isn't regular aio
+ */
+static bool aio_ctx_supports_cancel(struct kioctx *ctx)
+{
+   int noflags = IOCTX_FLAG_USERIOCB | IOCTX_FLAG_IOPOLL |
+ IOCTX_FLAG_SCQRING;
+
+   return (ctx->flags & noflags) == 0;
+}
+
 #define AIO_EVENTS_PER_PAGE(PAGE_SIZE / sizeof(struct io_event))
 #define AIO_EVENTS_FIRST_PAGE  ((PAGE_SIZE - sizeof(struct aio_ring)) / 
sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET  (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
@@ -650,6 +675,8 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, 
kiocb_cancel_fn *cancel)
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
 
+   if (WARN_ON_ONCE(!aio_ctx_supports_cancel(ctx)))
+   return;
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
 
@@ -673,6 +700,7 @@ static void free_ioctx(struct work_struct *work)
 
aio_iocb_buffer_unmap(ctx);
aio_useriocb_unmap(ctx);
+   aio_scqring_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
percpu_ref_exit(&ctx->reqs);
@@ -1218,6 +1246,47 @@ static void aio_fill_event(struct io_event *ev, struct 
aio_kiocb *iocb,
ev->res2 

[PATCH 25/26] aio: split old ring complete out from aio_complete()

2018-12-07 Thread Jens Axboe
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index fd323b3ba499..de48faeab0fd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1218,12 +1218,9 @@ static void aio_fill_event(struct io_event *ev, struct 
aio_kiocb *iocb,
ev->res2 = res2;
 }
 
-/* aio_complete
- * Called when the io request on the given iocb is complete.
- */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_ring_complete(struct kioctx *ctx, struct aio_kiocb *iocb,
+ long res, long res2)
 {
-   struct kioctx   *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
@@ -1273,6 +1270,16 @@ static void aio_complete(struct aio_kiocb *iocb, long 
res, long res2)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
pr_debug("added to ring %p at [%u]\n", iocb, tail);
+}
+
+/* aio_complete
+ * Called when the io request on the given iocb is complete.
+ */
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+{
+   struct kioctx *ctx = iocb->ki_ctx;
+
+   aio_ring_complete(ctx, iocb, res, res2);
 
/*
 * Check if the user asked us to deliver the result through an
-- 
2.17.1



[PATCH 17/26] fs: add fget_many() and fput_many()

2018-12-07 Thread Jens Axboe
Some use cases repeatedly get and put references to the same file, but
the only exposed interface is doing these one at a time. As each of
these entail an atomic inc or dec on a shared structure, that cost can
add up.

Add fget_many(), which works just like fget(), except it takes an
argument for how many references to get on the file. Ditto fput_many(),
which can drop an arbitrary number of references to a file.
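
A rough usage sketch (helper and variable names are ours, error handling
trimmed): a submitter that knows it will queue several requests against
one fd takes all the references with a single atomic operation and
returns the surplus in one go.

#include <linux/file.h>
#include <linux/errno.h>

static int submit_batch(unsigned int fd, unsigned int nr)
{
    struct file *file;
    unsigned int used = 0;

    /* one atomic op instead of 'nr' separate fget() calls */
    file = fget_many(fd, nr);
    if (!file)
        return -EBADF;

    /* ... queue up to 'nr' requests, each logically owning one ref ... */

    if (used < nr)
        fput_many(file, nr - used);    /* drop the surplus at once */
    return 0;
}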

Signed-off-by: Jens Axboe 
---
 fs/file.c| 15 ++-
 fs/file_table.c  | 10 --
 include/linux/file.h |  2 ++
 include/linux/fs.h   |  3 ++-
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9d103d..ad9870edfd51 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -676,7 +676,7 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
 }
 
-static struct file *__fget(unsigned int fd, fmode_t mask)
+static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
 {
struct files_struct *files = current->files;
struct file *file;
@@ -691,7 +691,7 @@ static struct file *__fget(unsigned int fd, fmode_t mask)
 */
if (file->f_mode & mask)
file = NULL;
-   else if (!get_file_rcu(file))
+   else if (!get_file_rcu_many(file, refs))
goto loop;
}
rcu_read_unlock();
@@ -699,15 +699,20 @@ static struct file *__fget(unsigned int fd, fmode_t mask)
return file;
 }
 
+struct file *fget_many(unsigned int fd, unsigned int refs)
+{
+   return __fget(fd, FMODE_PATH, refs);
+}
+
 struct file *fget(unsigned int fd)
 {
-   return __fget(fd, FMODE_PATH);
+   return fget_many(fd, 1);
 }
 EXPORT_SYMBOL(fget);
 
 struct file *fget_raw(unsigned int fd)
 {
-   return __fget(fd, 0);
+   return __fget(fd, 0, 1);
 }
 EXPORT_SYMBOL(fget_raw);
 
@@ -738,7 +743,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t 
mask)
return 0;
return (unsigned long)file;
} else {
-   file = __fget(fd, mask);
+   file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
diff --git a/fs/file_table.c b/fs/file_table.c
index e49af4caf15d..6a3964df33e4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -326,9 +326,9 @@ void flush_delayed_fput(void)
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
-void fput(struct file *file)
+void fput_many(struct file *file, unsigned int refs)
 {
-   if (atomic_long_dec_and_test(&file->f_count)) {
+   if (atomic_long_sub_and_test(refs, &file->f_count)) {
struct task_struct *task = current;
 
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -347,6 +347,12 @@ void fput(struct file *file)
}
 }
 
+void fput(struct file *file)
+{
+   fput_many(file, 1);
+}
+
+
 /*
  * synchronous analog of fput(); for kernel threads that might be needed
  * in some umount() (and thus can't use flush_delayed_fput() without
diff --git a/include/linux/file.h b/include/linux/file.h
index 6b2fb032416c..3fcddff56bc4 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -13,6 +13,7 @@
 struct file;
 
 extern void fput(struct file *);
+extern void fput_many(struct file *, unsigned int);
 
 struct file_operations;
 struct vfsmount;
@@ -44,6 +45,7 @@ static inline void fdput(struct fd fd)
 }
 
 extern struct file *fget(unsigned int fd);
+extern struct file *fget_many(unsigned int fd, unsigned int refs);
 extern struct file *fget_raw(unsigned int fd);
 extern unsigned long __fdget(unsigned int fd);
 extern unsigned long __fdget_raw(unsigned int fd);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6a5f71f8ae06..dc54a65c401a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -952,7 +952,8 @@ static inline struct file *get_file(struct file *f)
atomic_long_inc(>f_count);
return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+#define get_file_rcu_many(x, cnt) atomic_long_add_unless(&(x)->f_count, (cnt), 
0)
+#define get_file_rcu(x) get_file_rcu_many((x), 1)
 #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)  atomic_long_read(&(x)->f_count)
 
-- 
2.17.1



[PATCH 18/26] aio: use fget/fput_many() for file references

2018-12-07 Thread Jens Axboe
On the submission side, add file reference batching to the
aio_submit_state. We get as many references as the number of iocbs we
are submitting, and drop unused ones if we end up switching files. The
assumption here is that we're usually only dealing with one fd, and if
there are multiple, hopefully they are at least somewhat ordered. Could
trivially be extended to cover multiple fds, if needed.

On the completion side we do the same thing, except this is trivially
done just locally in aio_iopoll_reap().
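
The completion-side trick is just run-length batching of reference
drops: count consecutive completions against the same file, then issue
one multi-reference put per run. A self-contained userspace illustration
of the idea, with C11 atomics standing in for the file refcount:

/* Build with: cc -std=c11 -o batched_put batched_put.c */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
    atomic_long count;
};

static void put_many(struct obj *o, long refs)
{
    atomic_fetch_sub(&o->count, refs);    /* one atomic op per run */
}

int main(void)
{
    struct obj a = { 10 }, b = { 5 };
    struct obj *completions[] = { &a, &a, &a, &b, &b, &a };
    struct obj *cur = NULL;
    long run = 0;

    for (size_t i = 0; i < sizeof(completions) / sizeof(*completions); i++) {
        if (completions[i] == cur) {
            run++;
            continue;
        }
        if (cur)
            put_many(cur, run);
        cur = completions[i];
        run = 1;
    }
    if (cur)
        put_many(cur, run);

    printf("a=%ld b=%ld\n", atomic_load(&a.count), atomic_load(&b.count));
    return 0;
}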

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 106 +++
 1 file changed, 91 insertions(+), 15 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 5e840396f6d8..3c07cc9cb11a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -253,6 +253,15 @@ struct aio_submit_state {
 */
struct list_head req_list;
unsigned int req_count;
+
+   /*
+* File reference cache
+*/
+   struct file *file;
+   unsigned int fd;
+   unsigned int has_refs;
+   unsigned int used_refs;
+   unsigned int ios_left;
 };
 
 /*-- sysctl variables*/
@@ -1355,7 +1364,8 @@ static long aio_iopoll_reap(struct kioctx *ctx, struct 
io_event __user *evs,
 {
void *iocbs[AIO_IOPOLL_BATCH];
struct aio_kiocb *iocb, *n;
-   int to_free = 0, ret = 0;
+   int file_count, to_free = 0, ret = 0;
+   struct file *file = NULL;
 
/* Shouldn't happen... */
if (*nr_events >= max)
@@ -1372,7 +1382,20 @@ static long aio_iopoll_reap(struct kioctx *ctx, struct 
io_event __user *evs,
list_del(&iocb->ki_list);
iocbs[to_free++] = iocb;
 
-   fput(iocb->rw.ki_filp);
+   /*
+* Batched puts of the same file, to avoid dirtying the
+* file usage count multiple times, if avoidable.
+*/
+   if (!file) {
+   file = iocb->rw.ki_filp;
+   file_count = 1;
+   } else if (file == iocb->rw.ki_filp) {
+   file_count++;
+   } else {
+   fput_many(file, file_count);
+   file = iocb->rw.ki_filp;
+   file_count = 1;
+   }
 
if (evs && copy_to_user(evs + *nr_events, &iocb->ki_ev,
sizeof(iocb->ki_ev))) {
@@ -1382,6 +1405,9 @@ static long aio_iopoll_reap(struct kioctx *ctx, struct 
io_event __user *evs,
(*nr_events)++;
}
 
+   if (file)
+   fput_many(file, file_count);
+
if (to_free)
iocb_put_many(ctx, iocbs, &to_free);
 
@@ -1807,13 +1833,58 @@ static void aio_complete_rw_poll(struct kiocb *kiocb, 
long res, long res2)
}
 }
 
-static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb)
+static void aio_file_put(struct aio_submit_state *state)
+{
+   if (state->file) {
+   int diff = state->has_refs - state->used_refs;
+
+   if (diff)
+   fput_many(state->file, diff);
+   state->file = NULL;
+   }
+}
+
+/*
+ * Get as many references to a file as we have IOs left in this submission,
+ * assuming most submissions are for one file, or at least that each file
+ * has more than one submission.
+ */
+static struct file *aio_file_get(struct aio_submit_state *state, int fd)
+{
+   if (!state)
+   return fget(fd);
+
+   if (!state->file) {
+get_file:
+   state->file = fget_many(fd, state->ios_left);
+   if (!state->file)
+   return NULL;
+
+   state->fd = fd;
+   state->has_refs = state->ios_left;
+   state->used_refs = 1;
+   state->ios_left--;
+   return state->file;
+   }
+
+   if (state->fd == fd) {
+   state->used_refs++;
+   state->ios_left--;
+   return state->file;
+   }
+
+   aio_file_put(state);
+   goto get_file;
+}
+
+static int aio_prep_rw(struct aio_kiocb *kiocb, const struct iocb *iocb,
+  struct aio_submit_state *state)
 {
struct kioctx *ctx = kiocb->ki_ctx;
struct kiocb *req = &kiocb->rw;
int ret;
 
-   req->ki_filp = fget(iocb->aio_fildes);
+   req->ki_filp = aio_file_get(state, iocb->aio_fildes);
if (unlikely(!req->ki_filp))
return -EBADF;
req->ki_pos = iocb->aio_offset;
@@ -1963,7 +2034,8 @@ static void aio_iopoll_iocb_issued(struct 
aio_submit_state *state,
 }
 
 static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb,
-   bool vectored, bool compat)
+   struct aio_submit_state *state, bool vectored,
+   bool compat)
 {
struct iovec inline_vec

[PATCH 24/26] aio: add support for pre-mapped user IO buffers

2018-12-07 Thread Jens Axboe
If we have fixed user buffers, we can map them into the kernel when we
setup the io_context. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must set both
IOCTX_FLAG_USERIOCB, to provide iocb's in userspace, and then
IOCTX_FLAG_FIXEDBUFS. The latter tells aio that the iocbs that are
mapped already contain valid destination and sizes. These buffers can
then be mapped into the kernel for the lifetime of the io_context, as
opposed to just the duration of each single IO.

Only works with non-vectored read/write commands for now, not with
PREADV/PWRITEV.

A limit of 4M is imposed as the largest buffer we currently support.
There's nothing preventing us from going larger, but we need some cap,
and 4M seemed like it would definitely be big enough. RLIMIT_MEMLOCK
is used to cap the total amount of memory pinned.

See the fio change for how to utilize this feature:

http://git.kernel.dk/cgit/fio/commit/?id=2041bd343da1c1e955253f62374588718c64f0f3
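
From the application side, use of this looks roughly like the sketch
below: the user iocb array is filled with buffer addresses and lengths
before io_setup2() is called with both flags, so the kernel can pin
everything once at setup. The IOCTX_FLAG_* values and the syscall number
are placeholders that must come from the patched headers; only the
overall flow is taken from this series.

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* Placeholder values; take the real ones from the patched
 * <linux/aio_abi.h> and syscall table (335 on x86-64 per this series). */
#define __NR_io_setup2          335
#define IOCTX_FLAG_USERIOCB     (1 << 0)
#define IOCTX_FLAG_FIXEDBUFS    (1 << 2)

#define NR_REQS     32
#define BUF_SIZE    (64 * 1024)    /* must stay <= 4MB per the patch */

int setup_fixed_bufs(int fd, aio_context_t *ctx, struct iocb **iocbs_out)
{
    struct iocb *iocbs;
    int i;

    /* the user iocb array itself must be page aligned */
    if (posix_memalign((void **)&iocbs, getpagesize(),
                       NR_REQS * sizeof(*iocbs)))
        return -1;
    memset(iocbs, 0, NR_REQS * sizeof(*iocbs));

    for (i = 0; i < NR_REQS; i++) {
        void *buf;

        if (posix_memalign(&buf, 4096, BUF_SIZE))
            return -1;
        iocbs[i].aio_fildes = fd;
        iocbs[i].aio_lio_opcode = IOCB_CMD_PREAD;
        iocbs[i].aio_buf = (unsigned long)buf;
        iocbs[i].aio_nbytes = BUF_SIZE;
    }

    *iocbs_out = iocbs;
    return syscall(__NR_io_setup2, NR_REQS,
                   IOCTX_FLAG_USERIOCB | IOCTX_FLAG_FIXEDBUFS,
                   iocbs, NULL, NULL, ctx);
}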

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 193 ---
 include/uapi/linux/aio_abi.h |   1 +
 2 files changed, 177 insertions(+), 17 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 52a0b2291f71..fd323b3ba499 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -97,6 +98,11 @@ struct aio_mapped_range {
long nr_pages;
 };
 
+struct aio_mapped_ubuf {
+   struct bio_vec *bvec;
+   unsigned int nr_bvecs;
+};
+
 struct kioctx {
struct percpu_ref   users;
atomic_tdead;
@@ -132,6 +138,8 @@ struct kioctx {
struct page **ring_pages;
longnr_pages;
 
+   struct aio_mapped_ubuf  *user_bufs;
+
struct aio_mapped_range iocb_range;
 
struct rcu_work free_rwork; /* see free_ioctx() */
@@ -301,6 +309,7 @@ static const bool aio_use_state_req_list = false;
 
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
+static void aio_iocb_buffer_unmap(struct kioctx *);
 
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
@@ -662,6 +671,7 @@ static void free_ioctx(struct work_struct *work)
  free_rwork);
pr_debug("freeing %p\n", ctx);
 
+   aio_iocb_buffer_unmap(ctx);
aio_useriocb_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
@@ -1675,6 +1685,122 @@ static int aio_useriocb_map(struct kioctx *ctx, struct 
iocb __user *iocbs)
return aio_map_range(&ctx->iocb_range, iocbs, size, 0);
 }
 
+static void aio_iocb_buffer_unmap(struct kioctx *ctx)
+{
+   int i, j;
+
+   if (!ctx->user_bufs)
+   return;
+
+   for (i = 0; i < ctx->max_reqs; i++) {
+   struct aio_mapped_ubuf *amu = &ctx->user_bufs[i];
+
+   for (j = 0; j < amu->nr_bvecs; j++)
+   put_page(amu->bvec[j].bv_page);
+
+   kfree(amu->bvec);
+   amu->nr_bvecs = 0;
+   }
+
+   kfree(ctx->user_bufs);
+   ctx->user_bufs = NULL;
+}
+
+static int aio_iocb_buffer_map(struct kioctx *ctx)
+{
+   unsigned long total_pages, page_limit;
+   struct page **pages = NULL;
+   int i, j, got_pages = 0;
+   struct iocb *iocb;
+   int ret = -EINVAL;
+
+   ctx->user_bufs = kzalloc(ctx->max_reqs * sizeof(struct aio_mapped_ubuf),
+   GFP_KERNEL);
+   if (!ctx->user_bufs)
+   return -ENOMEM;
+
+   /* Don't allow more pages than we can safely lock */
+   total_pages = 0;
+   page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+   for (i = 0; i < ctx->max_reqs; i++) {
+   struct aio_mapped_ubuf *amu = &ctx->user_bufs[i];
+   unsigned long off, start, end, ubuf;
+   int pret, nr_pages;
+   size_t size;
+
+   iocb = aio_iocb_from_index(ctx, i);
+
+   /*
+* Don't impose further limits on the size and buffer
+* constraints here, we'll -EINVAL later when IO is
+* submitted if they are wrong.
+*/
+   ret = -EFAULT;
+   if (!iocb->aio_buf)
+   goto err;
+
+   /* arbitrary limit, but we need something */
+   if (iocb->aio_nbytes > SZ_4M)
+   goto err;
+
+   ubuf = iocb->aio_buf;
+   end = (ubuf + iocb->aio_nbytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   start = ubuf >> PAGE_SHIFT;
+   nr_pages = end - start;
+
+   ret = -ENOMEM;
+   if (total_pages + nr_pages > page_limit)
+   goto err;
+
+   if

[PATCH 19/26] aio: split iocb init from allocation

2018-12-07 Thread Jens Axboe
In preparation for having pre-allocated requests that we then just
need to initialize before use.

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 3c07cc9cb11a..51c7159f09bf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1099,6 +1099,16 @@ static bool get_reqs_available(struct kioctx *ctx)
return __get_reqs_available(ctx);
 }
 
+static void aio_iocb_init(struct kioctx *ctx, struct aio_kiocb *req)
+{
+   percpu_ref_get(&ctx->reqs);
+   req->ki_ctx = ctx;
+   INIT_LIST_HEAD(&req->ki_list);
+   req->ki_flags = 0;
+   refcount_set(&req->ki_refcnt, 0);
+   req->ki_eventfd = NULL;
+}
+
 /* aio_get_req
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -1111,12 +1121,7 @@ static inline struct aio_kiocb *aio_get_req(struct 
kioctx *ctx)
if (unlikely(!req))
return NULL;
 
-   percpu_ref_get(&ctx->reqs);
-   req->ki_ctx = ctx;
-   INIT_LIST_HEAD(&req->ki_list);
-   req->ki_flags = 0;
-   refcount_set(&req->ki_refcnt, 0);
-   req->ki_eventfd = NULL;
+   aio_iocb_init(ctx, req);
return req;
 }
 
-- 
2.17.1



[PATCH 23/26] fs: add support for mapping an ITER_BVEC for O_DIRECT

2018-12-07 Thread Jens Axboe
This adds support for sync/async O_DIRECT to make a bvec type iter
for bdev access, as well as iomap.
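
For context, the bvec iterator those paths receive would be built over
the already-pinned pages along these lines. This is a sketch only:
iov_iter_bvec()'s argument list has shifted between kernel releases, so
treat the exact signature (and the READ direction constant) as an
assumption.

#include <linux/uio.h>

/* 'bvec'/'nr_bvecs' would come from the fixed-buffer registration step;
 * the dio code then sees iov_iter_is_bvec(iter) == true. */
static void init_fixed_iter(struct iov_iter *iter, const struct bio_vec *bvec,
                            unsigned int nr_bvecs, size_t len)
{
    iov_iter_bvec(iter, READ, bvec, nr_bvecs, len);
}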

Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 16 
 fs/iomap.c | 10 +++---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b8f574615792..236c6abe649d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -219,7 +219,10 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
bio.bi_end_io = blkdev_bio_end_io_simple;
bio.bi_ioprio = iocb->ki_ioprio;
 
-   ret = bio_iov_iter_get_pages(, iter);
+   if (iov_iter_is_bvec(iter))
+   ret = bio_iov_bvec_add_pages(, iter);
+   else
+   ret = bio_iov_iter_get_pages(, iter);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -326,8 +329,9 @@ static void blkdev_bio_end_io(struct bio *bio)
struct bio_vec *bvec;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i)
-   put_page(bvec->bv_page);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_for_each_segment_all(bvec, bio, i)
+   put_page(bvec->bv_page);
bio_put(bio);
}
 }
@@ -381,7 +385,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
 
-   ret = bio_iov_iter_get_pages(bio, iter);
+   if (iov_iter_is_bvec(iter))
+   ret = bio_iov_bvec_add_pages(bio, iter);
+   else
+   ret = bio_iov_iter_get_pages(bio, iter);
+
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
diff --git a/fs/iomap.c b/fs/iomap.c
index bd483fcb7b5a..e36ad0be03f5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1573,8 +1573,9 @@ static void iomap_dio_bio_end_io(struct bio *bio)
struct bio_vec *bvec;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i)
-   put_page(bvec->bv_page);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_for_each_segment_all(bvec, bio, i)
+   put_page(bvec->bv_page);
bio_put(bio);
}
 }
@@ -1673,7 +1674,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
loff_t length,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
 
-   ret = bio_iov_iter_get_pages(bio, &iter);
+   if (iov_iter_is_bvec(&iter))
+   ret = bio_iov_bvec_add_pages(bio, &iter);
+   else
+   ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
/*
 * We have to stop part way through an IO. We must fall
-- 
2.17.1



[PATCH 22/26] block: implement bio helper to add iter bvec pages to bio

2018-12-07 Thread Jens Axboe
For an ITER_BVEC, we can just iterate the iov and add the pages
to the bio directly.

Signed-off-by: Jens Axboe 
---
 block/bio.c | 27 +++
 include/linux/bio.h |  1 +
 2 files changed, 28 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 3e45e5650265..8158edeb750e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -904,6 +904,33 @@ int bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter)
 }
 EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
 
+/**
+ * bio_iov_bvec_add_pages - add pages from an ITER_BVEC to a bio
+ * @bio: bio to add pages to
+ * @iter: iov iterator describing the region to be added
+ *
+ * Iterate pages in the @iter and add them to the bio. We flag the
+ * @bio with BIO_HOLD_PAGES, telling IO completion not to free them.
+ */
+int bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+{
+   unsigned short orig_vcnt = bio->bi_vcnt;
+   const struct bio_vec *bv;
+
+   do {
+   size_t size;
+
+   bv = iter->bvec + iter->iov_offset;
+   size = bio_add_page(bio, bv->bv_page, bv->bv_len, 
bv->bv_offset);
+   if (size != bv->bv_len)
+   break;
+   iov_iter_advance(iter, size);
+   } while (iov_iter_count(iter) && !bio_full(bio));
+
+   bio_set_flag(bio, BIO_HOLD_PAGES);
+   return bio->bi_vcnt > orig_vcnt ? 0 : -EINVAL;
+}
+
 static void submit_bio_wait_endio(struct bio *bio)
 {
complete(bio->bi_private);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7380b094dcca..ca25ea890192 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -434,6 +434,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
*page,
 void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
+int bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter);
 struct rq_map_data;
 extern struct bio *bio_map_user_iov(struct request_queue *,
struct iov_iter *, gfp_t);
-- 
2.17.1



[PATCH 16/26] aio: add submission side request cache

2018-12-07 Thread Jens Axboe
We have to add each submitted polled request to the io_context
poll_submitted list, which means we have to grab the poll_lock. We
already use the block plug to batch submissions if we're doing a batch
of IO submissions; extend that to cover the poll requests internally as
well.
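
Conceptually this is the classic batch-then-splice pattern: requests
accumulate on a per-submission private list with no locking, and the
whole batch is moved to the shared list with a single lock round trip
(either when the batch fills or from the plug callback when we
schedule). A stand-alone sketch of that pattern, not the patch's exact
code:

#include <linux/list.h>
#include <linux/spinlock.h>

struct poll_ctx {                      /* stand-in for the real kioctx */
    spinlock_t poll_lock;
    struct list_head poll_submitted;
};

struct submit_batch {
    struct list_head reqs;             /* private; INIT_LIST_HEAD() at start */
    unsigned int count;
};

/* One lock acquisition moves the whole batch to the shared list. */
static void batch_flush(struct poll_ctx *ctx, struct submit_batch *b)
{
    spin_lock(&ctx->poll_lock);
    list_splice_tail_init(&b->reqs, &ctx->poll_submitted);
    spin_unlock(&ctx->poll_lock);
    b->count = 0;
}

static void batch_add(struct poll_ctx *ctx, struct submit_batch *b,
                      struct list_head *req)
{
    list_add_tail(req, &b->reqs);
    if (++b->count >= 8)               /* AIO_IOPOLL_BATCH in the patch */
        batch_flush(ctx, b);
}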

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 136 +--
 1 file changed, 113 insertions(+), 23 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index cc8763b395c1..5e840396f6d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -240,6 +240,21 @@ struct aio_kiocb {
};
 };
 
+struct aio_submit_state {
+   struct kioctx *ctx;
+
+   struct blk_plug plug;
+#ifdef CONFIG_BLOCK
+   struct blk_plug_cb plug_cb;
+#endif
+
+   /*
+* Polled iocbs that have been submitted, but not added to the ctx yet
+*/
+   struct list_head req_list;
+   unsigned int req_count;
+};
+
 /*-- sysctl variables*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;  /* current system wide number of aio requests */
@@ -257,6 +272,15 @@ static const struct address_space_operations aio_ctx_aops;
 static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
 
+/*
+ * We rely on block level unplugs to flush pending requests, if we schedule
+ */
+#ifdef CONFIG_BLOCK
+static const bool aio_use_state_req_list = true;
+#else
+static const bool aio_use_state_req_list = false;
+#endif
+
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
 
@@ -1890,13 +1914,28 @@ static inline void aio_rw_done(struct kiocb *req, 
ssize_t ret)
}
 }
 
+/*
+ * Called either at the end of IO submission, or through a plug callback
+ * because we're going to schedule. Moves out local batch of requests to
+ * the ctx poll list, so they can be found for polling + reaping.
+ */
+static void aio_flush_state_reqs(struct kioctx *ctx,
+struct aio_submit_state *state)
+{
+   spin_lock(&ctx->poll_lock);
+   list_splice_tail_init(&state->req_list, &ctx->poll_submitted);
+   spin_unlock(&ctx->poll_lock);
+   state->req_count = 0;
+}
+
 /*
  * After the iocb has been issued, it's safe to be found on the poll list.
  * Adding the kiocb to the list AFTER submission ensures that we don't
  * find it from a io_getevents() thread before the issuer is done accessing
  * the kiocb cookie.
  */
-static void aio_iopoll_iocb_issued(struct aio_kiocb *kiocb)
+static void aio_iopoll_iocb_issued(struct aio_submit_state *state,
+  struct aio_kiocb *kiocb)
 {
/*
 * For fast devices, IO may have already completed. If it has, add
@@ -1906,12 +1945,21 @@ static void aio_iopoll_iocb_issued(struct aio_kiocb 
*kiocb)
const int front_add = test_bit(IOCB_POLL_COMPLETED, &kiocb->ki_flags);
struct kioctx *ctx = kiocb->ki_ctx;
 
-   spin_lock(&ctx->poll_lock);
-   if (front_add)
-   list_add(&kiocb->ki_list, &ctx->poll_submitted);
-   else
-   list_add_tail(&kiocb->ki_list, &ctx->poll_submitted);
-   spin_unlock(&ctx->poll_lock);
+   if (!state || !aio_use_state_req_list) {
+   spin_lock(&ctx->poll_lock);
+   if (front_add)
+   list_add(&kiocb->ki_list, &ctx->poll_submitted);
+   else
+   list_add_tail(&kiocb->ki_list, &ctx->poll_submitted);
+   spin_unlock(&ctx->poll_lock);
+   } else {
+   if (front_add)
+   list_add(&kiocb->ki_list, &state->req_list);
+   else
+   list_add_tail(&kiocb->ki_list, &state->req_list);
+   if (++state->req_count >= AIO_IOPOLL_BATCH)
+   aio_flush_state_reqs(ctx, state);
+   }
 }
 
 static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb,
@@ -2207,7 +2255,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const 
struct iocb *iocb)
 }
 
 static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
-  struct iocb __user *user_iocb, bool compat)
+  struct iocb __user *user_iocb,
+  struct aio_submit_state *state, bool compat)
 {
struct aio_kiocb *req;
ssize_t ret;
@@ -2311,7 +2360,7 @@ static int __io_submit_one(struct kioctx *ctx, const 
struct iocb *iocb,
ret = -EAGAIN;
goto out_put_req;
}
-   aio_iopoll_iocb_issued(req);
+   aio_iopoll_iocb_issued(state, req);
}
return 0;
 out_put_req:
@@ -2325,7 +2374,7 @@ static int __io_submit_one(struct kioctx *ctx, const 
struct iocb *iocb,
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-bool compat)
+struct aio_submit_state *state, bool compat)
 {

[PATCH 12/26] aio: abstract out io_event filler helper

2018-12-07 Thread Jens Axboe
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 06c8bcc72496..173f1f79dc8f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1063,6 +1063,15 @@ static inline void iocb_put(struct aio_kiocb *iocb)
}
 }
 
+static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+  long res, long res2)
+{
+   ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+   ev->data = iocb->ki_user_data;
+   ev->res = res;
+   ev->res2 = res2;
+}
+
 /* aio_complete
  * Called when the io request on the given iocb is complete.
  */
@@ -1090,10 +1099,7 @@ static void aio_complete(struct aio_kiocb *iocb, long 
res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 
-   event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
-   event->data = iocb->ki_user_data;
-   event->res = res;
-   event->res2 = res2;
+   aio_fill_event(event, iocb, res, res2);
 
kunmap_atomic(ev_page);
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
-- 
2.17.1



[PATCH 14/26] aio: add support for having user mapped iocbs

2018-12-07 Thread Jens Axboe
For io_submit(), we have to first copy each pointer to an iocb, then
copy the iocb. The latter is 64 bytes in size, and that's a lot of
copying for a single IO.

Add support for setting IOCTX_FLAG_USERIOCB through the new io_setup2()
system call, which allows the iocbs to reside in userspace. If this flag
is used, then io_submit() doesn't take pointers to iocbs anymore, it
takes an index value into the array of iocbs instead. Similarly, for
io_getevents(), the iocb ->obj will be the index, not the pointer to the
iocb.

See the change made to fio to support this feature; it's pretty
trivial to adapt to. For applications, like fio, that previously
embedded the iocb inside an application private structure, some sort
of lookup table/structure is needed to find the private IO structure
from the index at io_getevents() time.

http://git.kernel.dk/cgit/fio/commit/?id=3c3168e91329c83880c91e5abc28b9d6b940fd95
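
A userspace sketch of the setup and submission flow described above. The
syscall number comes from the x86-64 table in this series; the flag
value is a placeholder, and the way an index is carried through
io_submit()'s iocbpp slot below is inferred from this commit message
rather than taken from code shown in this excerpt.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

#define __NR_io_setup2      335        /* x86-64, per this series */
#define IOCTX_FLAG_USERIOCB (1 << 0)   /* placeholder value */

int setup_useriocb(unsigned int nr, aio_context_t *ctx, struct iocb **array)
{
    /* the iocb array handed to the kernel must be page aligned */
    if (posix_memalign((void **)array, getpagesize(),
                       nr * sizeof(struct iocb)))
        return -1;
    memset(*array, 0, nr * sizeof(struct iocb));

    return syscall(__NR_io_setup2, nr, IOCTX_FLAG_USERIOCB,
                   *array, NULL, NULL, ctx);
}

/* Submit slot 'idx' of the pre-mapped array: the iocbpp entry carries the
 * index value instead of a pointer (assumption based on the text above). */
int submit_by_index(aio_context_t ctx, unsigned long idx)
{
    struct iocb *slot = (struct iocb *)(uintptr_t)idx;

    return syscall(__NR_io_submit, ctx, 1, &slot);
}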

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 126 +++
 include/uapi/linux/aio_abi.h |   2 +
 2 files changed, 116 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 26631d6872d2..bb6f07ca6940 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -92,6 +92,11 @@ struct ctx_rq_wait {
atomic_t count;
 };
 
+struct aio_mapped_range {
+   struct page **pages;
+   long nr_pages;
+};
+
 struct kioctx {
struct percpu_ref   users;
atomic_tdead;
@@ -127,6 +132,8 @@ struct kioctx {
struct page **ring_pages;
longnr_pages;
 
+   struct aio_mapped_range iocb_range;
+
struct rcu_work free_rwork; /* see free_ioctx() */
 
/*
@@ -222,6 +229,11 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
+static const unsigned int iocb_page_shift =
+   ilog2(PAGE_SIZE / sizeof(struct iocb));
+
+static void aio_useriocb_unmap(struct kioctx *);
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
struct file *file;
@@ -578,6 +590,7 @@ static void free_ioctx(struct work_struct *work)
  free_rwork);
pr_debug("freeing %p\n", ctx);
 
+   aio_useriocb_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
percpu_ref_exit(>reqs);
@@ -1288,6 +1301,70 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
return ret;
 }
 
+static struct iocb *aio_iocb_from_index(struct kioctx *ctx, int index)
+{
+   struct iocb *iocb;
+
+   iocb = page_address(ctx->iocb_range.pages[index >> iocb_page_shift]);
+   index &= ((1 << iocb_page_shift) - 1);
+   return iocb + index;
+}
+
+static void aio_unmap_range(struct aio_mapped_range *range)
+{
+   int i;
+
+   if (!range->nr_pages)
+   return;
+
+   for (i = 0; i < range->nr_pages; i++)
+   put_page(range->pages[i]);
+
+   kfree(range->pages);
+   range->pages = NULL;
+   range->nr_pages = 0;
+}
+
+static int aio_map_range(struct aio_mapped_range *range, void __user *uaddr,
+size_t size, int gup_flags)
+{
+   int nr_pages, ret;
+
+   if ((unsigned long) uaddr & ~PAGE_MASK)
+   return -EINVAL;
+
+   nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+   range->pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+   if (!range->pages)
+   return -ENOMEM;
+
+   down_write(&current->mm->mmap_sem);
+   ret = get_user_pages((unsigned long) uaddr, nr_pages, gup_flags,
+   range->pages, NULL);
+   up_write(&current->mm->mmap_sem);
+
+   if (ret < nr_pages) {
+   kfree(range->pages);
+   return -ENOMEM;
+   }
+
+   range->nr_pages = nr_pages;
+   return 0;
+}
+
+static void aio_useriocb_unmap(struct kioctx *ctx)
+{
+   aio_unmap_range(&ctx->iocb_range);
+}
+
+static int aio_useriocb_map(struct kioctx *ctx, struct iocb __user *iocbs)
+{
+   size_t size = sizeof(struct iocb) * ctx->max_reqs;
+
+   return aio_map_range(&ctx->iocb_range, iocbs, size, 0);
+}
+
 SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
iocbs, void __user *, user1, void __user *, user2,
aio_context_t __user *, ctxp)
@@ -1296,7 +1373,9 @@ SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, 
struct iocb __user *,
unsigned long ctx;
long ret;
 
-   if (flags || user1 || user2)
+   if (user1 || user2)
+   return -EINVAL;
+   if (flags & ~IOCTX_FLAG_USERIOCB)
return -EINVAL;
 
ret = get_user(ctx, ctxp);
@@ -1308,9 +1387,17 @@ SYSCALL_DEFINE6(io_setup2, u32, nr_eve

[PATCH 11/26] aio: split out iocb copy from io_submit_one()

2018-12-07 Thread Jens Axboe
In preparation for handing in iocbs in a different fashion as well. Also
make it clear that the iocb being passed in isn't modified, by marking
it const throughout.

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 68 +++-
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index cf93b92bfb1e..06c8bcc72496 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1420,7 +1420,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long 
res, long res2)
aio_complete(iocb, res, res2);
 }
 
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 {
int ret;
 
@@ -1461,7 +1461,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb 
*iocb)
return ret;
 }
 
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
+static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
 {
void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
@@ -1500,8 +1500,8 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t 
ret)
}
 }
 
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
-   bool compat)
+static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+   bool vectored, bool compat)
 {
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1533,8 +1533,8 @@ static ssize_t aio_read(struct kiocb *req, struct iocb 
*iocb, bool vectored,
return ret;
 }
 
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
-   bool compat)
+static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+bool vectored, bool compat)
 {
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1589,7 +1589,8 @@ static void aio_fsync_work(struct work_struct *work)
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
 }
 
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+bool datasync)
 {
if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
iocb->aio_rw_flags))
@@ -1717,7 +1718,7 @@ aio_poll_queue_proc(struct file *file, struct 
wait_queue_head *head,
add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
struct kioctx *ctx = aiocb->ki_ctx;
struct poll_iocb *req = >poll;
@@ -1789,27 +1790,23 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct 
iocb *iocb)
return 0;
 }
 
-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-bool compat)
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+  struct iocb __user *user_iocb, bool compat)
 {
struct aio_kiocb *req;
-   struct iocb iocb;
ssize_t ret;
 
-   if (unlikely(copy_from_user(, user_iocb, sizeof(iocb
-   return -EFAULT;
-
/* enforce forwards compatibility on users */
-   if (unlikely(iocb.aio_reserved2)) {
+   if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
 
/* prevent overflows */
if (unlikely(
-   (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
-   (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
-   ((ssize_t)iocb.aio_nbytes < 0)
+   (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+   (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+   ((ssize_t)iocb->aio_nbytes < 0)
   )) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
@@ -1823,14 +1820,14 @@ static int io_submit_one(struct kioctx *ctx, struct 
iocb __user *user_iocb,
if (unlikely(!req))
goto out_put_reqs_available;
 
-   if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+   if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
 * instance of the file* now. The file descriptor must be
 * an eventfd() fd, and will be signaled for each completed
 * event using the eventfd_signal() function.
 */
-   req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
+   req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 

[PATCH 08/26] aio: don't zero entire aio_kiocb in aio_get_req()

2018-12-07 Thread Jens Axboe
It's 192 bytes, fairly substantial. Most items don't need to be cleared,
especially not upfront. Clear the ones we do need to clear, and leave
the other ones for setup when the iocb is prepared and submitted.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index eaceb40e6cf5..522c04864d82 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1009,14 +1009,15 @@ static inline struct aio_kiocb *aio_get_req(struct 
kioctx *ctx)
 {
struct aio_kiocb *req;
 
-   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
if (unlikely(!req))
return NULL;
 
percpu_ref_get(>reqs);
+   req->ki_ctx = ctx;
INIT_LIST_HEAD(>ki_list);
refcount_set(>ki_refcnt, 0);
-   req->ki_ctx = ctx;
+   req->ki_eventfd = NULL;
return req;
 }
 
@@ -1730,6 +1731,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct 
iocb *iocb)
if (unlikely(!req->file))
return -EBADF;
 
+   req->head = NULL;
+   req->woken = false;
+   req->cancelled = false;
+
apt.pt._qproc = aio_poll_queue_proc;
apt.pt._key = req->events;
apt.iocb = aiocb;
-- 
2.17.1



[PATCH 13/26] aio: add io_setup2() system call

2018-12-07 Thread Jens Axboe
This is just like io_setup(), except add a flags argument to let the
caller control/define some of the io_context behavior.

Outside of the flags, we add an iocb array and two user pointers for
future use.
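
Until a libc wrapper exists, calling it directly looks like the sketch
below; the wrapper name is ours, 335 is the x86-64 number added by this
patch (other architectures differ), and with flags == 0 this behaves
like a plain io_setup().

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

#define __NR_io_setup2  335    /* x86-64, per this patch */

static int io_setup2(unsigned int nr_events, unsigned int flags,
                     struct iocb *iocbs, void *user1, void *user2,
                     aio_context_t *ctx)
{
    return syscall(__NR_io_setup2, nr_events, flags, iocbs,
                   user1, user2, ctx);
}

int main(void)
{
    aio_context_t ctx = 0;    /* must be zero on entry, like io_setup() */

    if (io_setup2(128, 0, NULL, NULL, NULL, &ctx))
        return 1;
    /* ... io_submit()/io_getevents() as usual, then tear down ... */
    return syscall(__NR_io_destroy, ctx) ? 1 : 0;
}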

Signed-off-by: Jens Axboe 
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/aio.c   | 69 --
 include/linux/syscalls.h   |  3 ++
 include/uapi/asm-generic/unistd.h  |  4 +-
 kernel/sys_ni.c|  1 +
 5 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..67c357225fb0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332common  statx   __x64_sys_statx
 333common  io_pgetevents   __x64_sys_io_pgetevents
 334common  rseq__x64_sys_rseq
+335common  io_setup2   __x64_sys_io_setup2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index 173f1f79dc8f..26631d6872d2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -100,6 +100,8 @@ struct kioctx {
 
unsigned long   user_id;
 
+   unsigned intflags;
+
struct __percpu kioctx_cpu *cpu;
 
/*
@@ -686,10 +688,8 @@ static void aio_nr_sub(unsigned nr)
spin_unlock(_nr_lock);
 }
 
-/* ioctx_alloc
- * Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
- */
-static struct kioctx *ioctx_alloc(unsigned nr_events)
+static struct kioctx *io_setup_flags(unsigned long ctxid,
+unsigned int nr_events, unsigned int flags)
 {
struct mm_struct *mm = current->mm;
struct kioctx *ctx;
@@ -701,6 +701,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 */
unsigned int max_reqs = nr_events;
 
+   if (unlikely(ctxid || nr_events == 0)) {
+   pr_debug("EINVAL: ctx %lu nr_events %u\n",
+ctxid, nr_events);
+   return ERR_PTR(-EINVAL);
+   }
+
/*
 * We keep track of the number of available ringbuffer slots, to prevent
 * overflow (reqs_available), and we also use percpu counters for this.
@@ -726,6 +732,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
if (!ctx)
return ERR_PTR(-ENOMEM);
 
+   ctx->flags = flags;
ctx->max_reqs = max_reqs;
 
spin_lock_init(>ctx_lock);
@@ -1281,6 +1288,34 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
return ret;
 }
 
+SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
+   iocbs, void __user *, user1, void __user *, user2,
+   aio_context_t __user *, ctxp)
+{
+   struct kioctx *ioctx;
+   unsigned long ctx;
+   long ret;
+
+   if (flags || user1 || user2)
+   return -EINVAL;
+
+   ret = get_user(ctx, ctxp);
+   if (unlikely(ret))
+   goto out;
+
+   ioctx = io_setup_flags(ctx, nr_events, flags);
+   ret = PTR_ERR(ioctx);
+   if (IS_ERR(ioctx))
+   goto out;
+
+   ret = put_user(ioctx->user_id, ctxp);
+   if (ret)
+   kill_ioctx(current->mm, ioctx, NULL);
+   percpu_ref_put(&ioctx->users);
+out:
+   return ret;
+}
+
 /* sys_io_setup:
  * Create an aio_context capable of receiving at least nr_events.
  * ctxp must not point to an aio_context that already exists, and
@@ -1296,7 +1331,7 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
  */
 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
-   struct kioctx *ioctx = NULL;
+   struct kioctx *ioctx;
unsigned long ctx;
long ret;
 
@@ -1304,14 +1339,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
aio_context_t __user *, ctxp)
if (unlikely(ret))
goto out;
 
-   ret = -EINVAL;
-   if (unlikely(ctx || nr_events == 0)) {
-   pr_debug("EINVAL: ctx %lu nr_events %u\n",
-ctx, nr_events);
-   goto out;
-   }
-
-   ioctx = ioctx_alloc(nr_events);
+   ioctx = io_setup_flags(ctx, nr_events, 0);
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
@@ -1327,7 +1355,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
aio_context_t __user *, ctxp)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
 {
-   struct kioctx *ioctx = NULL;
+   struct kioctx *ioctx;
unsigned long ctx;
long ret;
 
@@ -1335,23 +1363,14 @@ COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
u32 __user *, ctx32p)
if (unlikely(ret))
goto out;
 
-  

[PATCH 15/26] aio: support for IO polling

2018-12-07 Thread Jens Axboe
Add polled variants of PREAD/PREADV and PWRITE/PWRITEV. These act
like their non-polled counterparts, except we expect to poll for
completion of them. The polling happens at io_getevents() time, and
works just like non-polled IO.

To setup an io_context for polled IO, the application must call
io_setup2() with IOCTX_FLAG_IOPOLL as one of the flags. It is illegal
to mix and match polled and non-polled IO on an io_context.

Polled IO doesn't support the user mapped completion ring. Events
must be reaped through the io_getevents() system call. For non-irq
driven poll devices, there's no way to support completion reaping
from userspace by just looking at the ring. The application itself
is the one that pulls completion entries.
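
A minimal userspace sketch of the flow: create the context with
IOCTX_FLAG_IOPOLL, submit an O_DIRECT read, and reap it via
io_getevents(), which now actively polls the device. The flag value is a
placeholder to be taken from the patched headers, and whether the
regular IOCB_CMD_PREAD opcode or a dedicated polled variant is used
depends on the version of this series.

#define _GNU_SOURCE            /* O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

#define __NR_io_setup2      335        /* x86-64, per this series */
#define IOCTX_FLAG_IOPOLL   (1 << 1)   /* placeholder value */

int polled_read(const char *path)
{
    struct iocb iocb, *iocbp = &iocb;
    struct io_event ev;
    aio_context_t ctx = 0;
    void *buf;
    int fd;

    fd = open(path, O_RDONLY | O_DIRECT);
    if (fd < 0 || posix_memalign(&buf, 4096, 4096))
        return -1;
    if (syscall(__NR_io_setup2, 8, IOCTX_FLAG_IOPOLL, NULL, NULL, NULL, &ctx))
        return -1;

    memset(&iocb, 0, sizeof(iocb));
    iocb.aio_fildes = fd;
    iocb.aio_lio_opcode = IOCB_CMD_PREAD;
    iocb.aio_buf = (unsigned long)buf;
    iocb.aio_nbytes = 4096;

    if (syscall(__NR_io_submit, ctx, 1, &iocbp) != 1)
        return -1;

    /* io_getevents() spins on the device until the request completes */
    return syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL) == 1 ? 0 : -1;
}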

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 396 +++
 include/uapi/linux/aio_abi.h |   3 +
 2 files changed, 363 insertions(+), 36 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index bb6f07ca6940..cc8763b395c1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -153,6 +153,18 @@ struct kioctx {
atomic_treqs_available;
} cacheline_aligned_in_smp;
 
+   /* iopoll submission state */
+   struct {
+   spinlock_t poll_lock;
+   struct list_head poll_submitted;
+   } cacheline_aligned_in_smp;
+
+   /* iopoll completion state */
+   struct {
+   struct list_head poll_completing;
+   struct mutex getevents_lock;
+   } cacheline_aligned_in_smp;
+
struct {
spinlock_t  ctx_lock;
struct list_head active_reqs;   /* used for cancellation */
@@ -205,14 +217,27 @@ struct aio_kiocb {
__u64   ki_user_data;   /* user's data for completion */
 
struct list_headki_list;/* the aio core uses this
-* for cancellation */
+* for cancellation, or for
+* polled IO */
+
+   unsigned long   ki_flags;
+#define IOCB_POLL_COMPLETED0   /* polled IO has completed */
+#define IOCB_POLL_EAGAIN   1   /* polled submission got EAGAIN */
+
refcount_t  ki_refcnt;
 
-   /*
-* If the aio_resfd field of the userspace iocb is not zero,
-* this is the underlying eventfd context to deliver events to.
-*/
-   struct eventfd_ctx  *ki_eventfd;
+   union {
+   /*
+* If the aio_resfd field of the userspace iocb is not zero,
+* this is the underlying eventfd context to deliver events to.
+*/
+   struct eventfd_ctx  *ki_eventfd;
+
+   /*
+* For polled IO, stash completion info here
+*/
+   struct io_event ki_ev;
+   };
 };
 
 /*-- sysctl variables*/
@@ -233,6 +258,7 @@ static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
 
 static void aio_useriocb_unmap(struct kioctx *);
+static void aio_iopoll_reap_events(struct kioctx *);
 
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
@@ -471,11 +497,15 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned 
int nr_events)
int i;
struct file *file;
 
-   /* Compensate for the ring buffer's head/tail overlap entry */
-   nr_events += 2; /* 1 is required, 2 for good luck */
-
+   /*
+* Compensate for the ring buffer's head/tail overlap entry.
+* IO polling doesn't require any io event entries
+*/
size = sizeof(struct aio_ring);
-   size += sizeof(struct io_event) * nr_events;
+   if (!(ctx->flags & IOCTX_FLAG_IOPOLL)) {
+   nr_events += 2; /* 1 is required, 2 for good luck */
+   size += sizeof(struct io_event) * nr_events;
+   }
 
nr_pages = PFN_UP(size);
if (nr_pages < 0)
@@ -758,6 +788,11 @@ static struct kioctx *io_setup_flags(unsigned long ctxid,
 
INIT_LIST_HEAD(&ctx->active_reqs);
 
+   spin_lock_init(&ctx->poll_lock);
+   INIT_LIST_HEAD(&ctx->poll_submitted);
+   INIT_LIST_HEAD(&ctx->poll_completing);
+   mutex_init(&ctx->getevents_lock);
+
if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
goto err;
 
@@ -829,11 +864,15 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx 
*ctx,
 {
struct kioctx_table *table;
 
+   mutex_lock(&ctx->getevents_lock);
spin_lock(&mm->ioctx_lock);
if (atomic_xchg(&ctx->dead, 1)) {
spin_unlock(&mm->ioctx_lock);
+   mutex_unlock(&ctx->getevents_lock);
return -EINVAL;
}
+   aio_iopoll_reap_events(ctx);
+   mutex_unlock(&ctx->getevents_lock);
 
table = rcu_dereference_raw(mm->ioctx_table)

[PATCH 07/26] aio: separate out ring reservation from req allocation

2018-12-07 Thread Jens Axboe
From: Christoph Hellwig 

This is in preparation for certain types of IO not needing a ring
reservation.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 30 +-
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index cf0de61743e8..eaceb40e6cf5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -901,7 +901,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned 
nr)
local_irq_restore(flags);
 }
 
-static bool get_reqs_available(struct kioctx *ctx)
+static bool __get_reqs_available(struct kioctx *ctx)
 {
struct kioctx_cpu *kcpu;
bool ret = false;
@@ -993,6 +993,14 @@ static void user_refill_reqs_available(struct kioctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
 }
 
+static bool get_reqs_available(struct kioctx *ctx)
+{
+   if (__get_reqs_available(ctx))
+   return true;
+   user_refill_reqs_available(ctx);
+   return __get_reqs_available(ctx);
+}
+
 /* aio_get_req
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -1001,24 +1009,15 @@ static inline struct aio_kiocb *aio_get_req(struct 
kioctx *ctx)
 {
struct aio_kiocb *req;
 
-   if (!get_reqs_available(ctx)) {
-   user_refill_reqs_available(ctx);
-   if (!get_reqs_available(ctx))
-   return NULL;
-   }
-
req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
if (unlikely(!req))
-   goto out_put;
+   return NULL;
 
percpu_ref_get(&ctx->reqs);
INIT_LIST_HEAD(&req->ki_list);
refcount_set(&req->ki_refcnt, 0);
req->ki_ctx = ctx;
return req;
-out_put:
-   put_reqs_available(ctx, 1);
-   return NULL;
 }
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
@@ -1805,9 +1804,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
return -EINVAL;
}
 
+   if (!get_reqs_available(ctx))
+   return -EAGAIN;
+
+   ret = -EAGAIN;
req = aio_get_req(ctx);
if (unlikely(!req))
-   return -EAGAIN;
+   goto out_put_reqs_available;
 
if (iocb.aio_flags & IOCB_FLAG_RESFD) {
/*
@@ -1870,11 +1873,12 @@ static int io_submit_one(struct kioctx *ctx, struct 
iocb __user *user_iocb,
goto out_put_req;
return 0;
 out_put_req:
-   put_reqs_available(ctx, 1);
percpu_ref_put(&ctx->reqs);
if (req->ki_eventfd)
eventfd_ctx_put(req->ki_eventfd);
kmem_cache_free(kiocb_cachep, req);
+out_put_reqs_available:
+   put_reqs_available(ctx, 1);
return ret;
 }
 
-- 
2.17.1



[PATCH 05/26] iomap: wire up the iopoll method

2018-12-07 Thread Jens Axboe
From: Christoph Hellwig 

Store the request queue the last bio was submitted to in the iocb
private data in addition to the cookie so that we find the right block
device.  Also refactor the common direct I/O bio submission code into a
nice little helper.

Signed-off-by: Christoph Hellwig 

Modified to use REQ_HIPRI_ASYNC for async polled IO.

Signed-off-by: Jens Axboe 
---
 fs/gfs2/file.c|  2 ++
 fs/iomap.c| 47 +--
 fs/xfs/xfs_file.c |  1 +
 include/linux/iomap.h |  1 +
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 45a17b770d97..358157efc5b7 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1280,6 +1280,7 @@ const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read_iter  = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+   .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap   = gfs2_mmap,
.open   = gfs2_open,
@@ -1310,6 +1311,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
.read_iter  = gfs2_file_read_iter,
.write_iter = gfs2_file_write_iter,
+   .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
.mmap   = gfs2_mmap,
.open   = gfs2_open,
diff --git a/fs/iomap.c b/fs/iomap.c
index d094e5688bd3..bd483fcb7b5a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1441,6 +1441,32 @@ struct iomap_dio {
};
 };
 
+int iomap_dio_iopoll(struct kiocb *kiocb, bool spin)
+{
+   struct request_queue *q = READ_ONCE(kiocb->private);
+
+   if (!q)
+   return 0;
+   return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
+}
+EXPORT_SYMBOL_GPL(iomap_dio_iopoll);
+
+static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
+   struct bio *bio)
+{
+   atomic_inc(&dio->ref);
+
+   if (dio->iocb->ki_flags & IOCB_HIPRI) {
+   if (!dio->wait_for_completion)
+   bio->bi_opf |= REQ_HIPRI_ASYNC;
+   else
+   bio->bi_opf |= REQ_HIPRI;
+   }
+
+   dio->submit.last_queue = bdev_get_queue(iomap->bdev);
+   dio->submit.cookie = submit_bio(bio);
+}
+
 static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
struct kiocb *iocb = dio->iocb;
@@ -1553,7 +1579,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
}
 }
 
-static blk_qc_t
+static void
 iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
unsigned len)
 {
@@ -1567,15 +1593,10 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap 
*iomap, loff_t pos,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
 
-   if (dio->iocb->ki_flags & IOCB_HIPRI)
-   flags |= REQ_HIPRI;
-
get_page(page);
__bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
-
-   atomic_inc(&dio->ref);
-   return submit_bio(bio);
+   iomap_dio_submit_bio(dio, iomap, bio);
 }
 
 static loff_t
@@ -1678,9 +1699,6 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
loff_t length,
bio_set_pages_dirty(bio);
}
 
-   if (dio->iocb->ki_flags & IOCB_HIPRI)
-   bio->bi_opf |= REQ_HIPRI;
-
iov_iter_advance(dio->submit.iter, n);
 
dio->size += n;
@@ -1688,11 +1706,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
loff_t length,
copied += n;
 
nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
-
-   atomic_inc(&dio->ref);
-
-   dio->submit.last_queue = bdev_get_queue(iomap->bdev);
-   dio->submit.cookie = submit_bio(bio);
+   iomap_dio_submit_bio(dio, iomap, bio);
} while (nr_pages);
 
/*
@@ -1912,6 +1926,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (dio->flags & IOMAP_DIO_WRITE_FUA)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
+   WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie);
+   WRITE_ONCE(iocb->private, dio->submit.last_queue);
+
if (!atomic_dec_and_test(&dio->ref)) {
if (!dio->wait_for_completion)
return -EIOCBQUEUED;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..60c2da41f0fc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1203,6 +1203,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read= generic_file_splice_read,
.splice_write   = iter_file_splice_write,
+   .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = x

[PATCH 09/26] aio: only use blk plugs for > 2 depth submissions

2018-12-07 Thread Jens Axboe
Plugging is meant to optimize submission of a string of IOs. If we don't
have more than 2 being submitted, don't bother setting up a plug.
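
For illustration, the resulting io_submit() flow looks roughly like
the sketch below (simplified from the diff that follows, not a literal
excerpt; the variables are the ones used there):

	struct blk_plug plug;

	if (nr > AIO_PLUG_THRESHOLD)
		blk_start_plug(&plug);	/* start batching block submissions */

	for (i = 0; i < nr; i++) {
		/* copy in and submit one iocb, stop on the first error */
		ret = io_submit_one(ctx, user_iocb, compat);
		if (ret)
			break;
	}

	if (nr > AIO_PLUG_THRESHOLD)
		blk_finish_plug(&plug);	/* flush whatever got batched */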

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 522c04864d82..ed6c3914477a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -69,6 +69,12 @@ struct aio_ring {
struct io_event io_events[0];
 }; /* 128 bytes + ring size */
 
+/*
+ * Plugging is meant to work with larger batches of IOs. If we don't
+ * have more than the below, then don't bother setting up a plug.
+ */
+#define AIO_PLUG_THRESHOLD 2
+
 #define AIO_RING_PAGES 8
 
 struct kioctx_table {
@@ -1919,7 +1925,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, 
nr,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
 
-   blk_start_plug(&plug);
+   if (nr > AIO_PLUG_THRESHOLD)
+   blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
struct iocb __user *user_iocb;
 
@@ -1932,7 +1939,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, 
nr,
if (ret)
break;
}
-   blk_finish_plug(&plug);
+   if (nr > AIO_PLUG_THRESHOLD)
+   blk_finish_plug(&plug);
 
percpu_ref_put(&ctx->users);
return i ? i : ret;
@@ -1959,7 +1967,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, 
ctx_id,
if (nr > ctx->nr_events)
nr = ctx->nr_events;
 
-   blk_start_plug(&plug);
+   if (nr > AIO_PLUG_THRESHOLD)
+   blk_start_plug(&plug);
for (i = 0; i < nr; i++) {
compat_uptr_t user_iocb;
 
@@ -1972,7 +1981,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, 
ctx_id,
if (ret)
break;
}
-   blk_finish_plug(&plug);
+   if (nr > AIO_PLUG_THRESHOLD)
+   blk_finish_plug(&plug);
 
percpu_ref_put(&ctx->users);
return i ? i : ret;
-- 
2.17.1



[PATCH 10/26] aio: use iocb_put() instead of open coding it

2018-12-07 Thread Jens Axboe
Replace the percpu_ref_put() + kmem_cache_free() with a call to
iocb_put() instead.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index ed6c3914477a..cf93b92bfb1e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1884,10 +1884,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
goto out_put_req;
return 0;
 out_put_req:
-   percpu_ref_put(&ctx->reqs);
if (req->ki_eventfd)
eventfd_ctx_put(req->ki_eventfd);
-   kmem_cache_free(kiocb_cachep, req);
+   iocb_put(req);
 out_put_reqs_available:
put_reqs_available(ctx, 1);
return ret;
-- 
2.17.1



[PATCH 04/26] block: use REQ_HIPRI_ASYNC for non-sync polled IO

2018-12-07 Thread Jens Axboe
Tell the block layer if it's a sync or async polled request, so it can
do the right thing.

Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6de8d35f6e41..b8f574615792 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -402,8 +402,12 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
 
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
-   if (iocb->ki_flags & IOCB_HIPRI)
-   bio->bi_opf |= REQ_HIPRI;
+   if (iocb->ki_flags & IOCB_HIPRI) {
+   if (!is_sync)
+   bio->bi_opf |= REQ_HIPRI_ASYNC;
+   else
+   bio->bi_opf |= REQ_HIPRI;
+   }
 
qc = submit_bio(bio);
WRITE_ONCE(iocb->ki_cookie, qc);
-- 
2.17.1



[PATCH 06/26] aio: use assigned completion handler

2018-12-07 Thread Jens Axboe
We know this is a read/write request, but in preparation for
having different kinds of those, ensure that we call the assigned
handler instead of assuming it's aio_complete_rq().

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index 05647d352bf3..cf0de61743e8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1490,7 +1490,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t 
ret)
ret = -EINTR;
/*FALLTHRU*/
default:
-   aio_complete_rw(req, ret, 0);
+   req->ki_complete(req, ret, 0);
}
 }
 
-- 
2.17.1



[PATCH 01/26] fs: add an iopoll method to struct file_operations

2018-12-07 Thread Jens Axboe
From: Christoph Hellwig 

This new method is used to explicitly poll for I/O completion for an
iocb.  It must be called for any iocb submitted asynchronously (that
is with a non-null ki_complete) which has the IOCB_HIPRI flag set.

The method is assisted by a new ki_cookie field in struct iocb to store
the polling cookie.

TODO: we can probably union ki_cookie with the existing hint and I/O
priority fields to avoid struct kiocb growth.
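
As a purely illustrative sketch (not part of this patch; the function
name below is made up), a completion-reaping loop on the caller side
would end up driving the new method roughly like this:

/*
 * Illustrative only: poll for completion of one async HIPRI iocb.
 * poll_one_iocb() is a made-up name; the real aio-side caller is
 * added later in this series.
 */
static int poll_one_iocb(struct kiocb *kiocb, bool spin)
{
	struct file *file = kiocb->ki_filp;

	if (!file->f_op->iopoll)
		return -EOPNOTSUPP;

	/*
	 * Like blk_poll(): > 0 means a completion was found, 0 means
	 * nothing has completed yet. With spin == true the method may
	 * busy-wait until something completes.
	 */
	return file->f_op->iopoll(kiocb, spin);
}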

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 Documentation/filesystems/vfs.txt | 3 +++
 include/linux/fs.h| 2 ++
 2 files changed, 5 insertions(+)

diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index 5f71a252e2e0..d9dc5e4d82b9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,6 +857,7 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+   int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -902,6 +903,8 @@ otherwise noted.
 
   write_iter: possibly asynchronous write with iov_iter as source
 
+  iopoll: called when aio wants to poll for completions on HIPRI iocbs
+
   iterate: called when the VFS needs to read the directory contents
 
   iterate_shared: called when the VFS needs to read the directory contents
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a1ab233e6469..6a5f71f8ae06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -310,6 +310,7 @@ struct kiocb {
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
+   unsigned int    ki_cookie; /* for ->iopoll */
 } __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -1781,6 +1782,7 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+   int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
-- 
2.17.1



[PATCHSET v6] Support for polled aio (and more)

2018-12-07 Thread Jens Axboe
For the grand introduction to this feature, see my original posting
here:

https://lore.kernel.org/linux-block/20181117235317.7366-1-ax...@kernel.dk/

and refer to the previous postings of this patchset for whatever
features were added there. Particularly v4 has some performance results:

https://lore.kernel.org/linux-block/20181130165646.27341-1-ax...@kernel.dk/

No new features in this version; I think we've got what we need. Just
a collection of fixes. Especially interesting is probably that XFS is
now tested in all configurations; it works with both IO polling and
fixed user buffers.

Find it here for your cgit browsing:

http://git.kernel.dk/cgit/linux-block/log/?h=aio-poll

or clone/pull from this branch:

git://git.kernel.dk/linux-block aio-poll


Since v5

- Correct spin/no-spin condition for polling
- Add some comments
- Fix missing io_event_ring(2) if !CONFIG_AIO
- Make sure XFS works for all cases
- Fix cq/sq ring sizing
- Make cq/sq ring work with non-polled IO
- Make cq/sq ring work without fixed buffers
- Ensure that io_ring_enter() doesn't race with kill_ioctx()
- Rebase on top of for-4.21/block


 Documentation/filesystems/vfs.txt  |3 +
 arch/x86/entry/syscalls/syscall_64.tbl |2 +
 block/bio.c|   33 +-
 fs/aio.c   | 1538 +---
 fs/block_dev.c |   34 +-
 fs/file.c  |   15 +-
 fs/file_table.c|   10 +-
 fs/gfs2/file.c |2 +
 fs/iomap.c |   57 +-
 fs/xfs/xfs_file.c  |1 +
 include/linux/bio.h|1 +
 include/linux/blk_types.h  |2 +
 include/linux/file.h   |2 +
 include/linux/fs.h |5 +-
 include/linux/iomap.h  |1 +
 include/linux/syscalls.h   |5 +
 include/uapi/asm-generic/unistd.h  |4 +-
 include/uapi/linux/aio_abi.h   |   32 +
 kernel/sys_ni.c|2 +
 19 files changed, 1552 insertions(+), 197 deletions(-)

-- 
Jens Axboe





[PATCH 02/26] block: add REQ_HIPRI_ASYNC

2018-12-07 Thread Jens Axboe
For the upcoming async polled IO, we can't sleep allocating requests.
If we do, then we introduce a deadlock where the submitter already
has async polled IO in-flight, but can't wait for them to complete
since polled requests must be actively found and reaped.
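
For illustration, a submitter later in this series picks between the
two flags based on whether the iocb is synchronous (condensed from the
block_dev/iomap patches; is_sync here stands for the submitter's
synchronous-iocb check):

	if (iocb->ki_flags & IOCB_HIPRI) {
		if (is_sync)
			/* submitter sits in a poll loop and reaps it */
			bio->bi_opf |= REQ_HIPRI;
		else
			/* polled + REQ_NOWAIT: allocation must not sleep */
			bio->bi_opf |= REQ_HIPRI_ASYNC;
	}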

Signed-off-by: Jens Axboe 
---
 include/linux/blk_types.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 46c005d601ac..921d734d6b5d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -347,6 +347,7 @@ enum req_flag_bits {
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
 #define REQ_NOUNMAP(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI  (1ULL << __REQ_HIPRI)
+#define REQ_HIPRI_ASYNC(REQ_HIPRI | REQ_NOWAIT)
 
 #define REQ_DRV(1ULL << __REQ_DRV)
 #define REQ_SWAP   (1ULL << __REQ_SWAP)
-- 
2.17.1



[PATCH 03/26] block: wire up block device iopoll method

2018-12-07 Thread Jens Axboe
From: Christoph Hellwig 

Just call blk_poll on the iocb cookie; we can derive the block device
from the inode trivially.

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e1886cc7048f..6de8d35f6e41 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -281,6 +281,14 @@ struct blkdev_dio {
 
 static struct bio_set blkdev_dio_pool;
 
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+   struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+   struct request_queue *q = bdev_get_queue(bdev);
+
+   return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
struct blkdev_dio *dio = bio->bi_private;
@@ -398,6 +406,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
bio->bi_opf |= REQ_HIPRI;
 
qc = submit_bio(bio);
+   WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
 
@@ -2070,6 +2079,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter  = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+   .iopoll = blkdev_iopoll,
.mmap   = generic_file_mmap,
.fsync  = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
-- 
2.17.1



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 3:00 PM, Jens Axboe wrote:
> On 12/7/18 2:59 PM, Jens Axboe wrote:
>> On 12/7/18 2:58 PM, Jens Axboe wrote:
>>> On 12/7/18 12:35 PM, Jens Axboe wrote:
>>>> On 12/7/18 12:34 PM, Jeff Moyer wrote:
>>>>> Jens Axboe  writes:
>>>>>
>>>>>> BTW, quick guess is that it doesn't work so well with fixed buffers, as 
>>>>>> that
>>>>>> hasn't been tested. You could try and remove IOCTX_FLAG_FIXEDBUFS from 
>>>>>> the
>>>>>> test program and see if that works.
>>>>>
>>>>> That results in a NULL pointer dereference.  I'll stick to block device
>>>>> testing for now.  :)
>>>>
>>>> Good plan :-)
>>>
>>> Took a look and fixed it up, my aio-poll branch has a fixup patch and I
>>> also updated aio-ring.c in the same location.
>>>
>>> Tested XFS with polling on and off, works for me.
>>>
>>> Will take a look at FIXEDBUFS now. Note that aio-ring has some parameters
>>> at the top:
>>>
>>> static int polled = 1;  /* use IO polling */
>>> static int fixedbufs = 0;   /* use fixed user buffers */
>>>
>>> that you can change to try the various modes. Don't use fixedbufs = 1
>>> with XFS just yet :-)
>>
>> Actually, with the other fixups, that works now too. At least in a quick
>> test...
> 
> And no, it does not ;-). Will take a look, we're dropping page counts where
> we should not be.

Forgot to add the BIO_HOLD_PAGES check in iomap, it works now. Pushed out
that fix, will also need folding.

-- 
Jens Axboe



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 2:59 PM, Jens Axboe wrote:
> On 12/7/18 2:58 PM, Jens Axboe wrote:
>> On 12/7/18 12:35 PM, Jens Axboe wrote:
>>> On 12/7/18 12:34 PM, Jeff Moyer wrote:
>>>> Jens Axboe  writes:
>>>>
>>>>> BTW, quick guess is that it doesn't work so well with fixed buffers, as 
>>>>> that
>>>>> hasn't been tested. You could try and remove IOCTX_FLAG_FIXEDBUFS from the
>>>>> test program and see if that works.
>>>>
>>>> That results in a NULL pointer dereference.  I'll stick to block device
>>>> testing for now.  :)
>>>
>>> Good plan :-)
>>
>> Took a look and fixed it up, my aio-poll branch has a fixup patch and I
>> also updated aio-ring.c in the same location.
>>
>> Tested XFS with polling on and off, works for me.
>>
>> Will take a look at FIXEDBUFS now. Note that aio-ring has some parameters
>> at the top:
>>
>> static int polled = 1;  /* use IO polling */
>> static int fixedbufs = 0;   /* use fixed user buffers */
>>
>> that you can change to try the various modes. Don't use fixedbufs = 1
>> with XFS just yet :-)
> 
> Actually, with the other fixups, that works now too. At least in a quick
> test...

And no, it does not ;-). Will take a look, we're dropping page counts where
we should not be.

-- 
Jens Axboe



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 2:58 PM, Jens Axboe wrote:
> On 12/7/18 12:35 PM, Jens Axboe wrote:
>> On 12/7/18 12:34 PM, Jeff Moyer wrote:
>>> Jens Axboe  writes:
>>>
>>>> BTW, quick guess is that it doesn't work so well with fixed buffers, as 
>>>> that
>>>> hasn't been tested. You could try and remove IOCTX_FLAG_FIXEDBUFS from the
>>>> test program and see if that works.
>>>
>>> That results in a NULL pointer dereference.  I'll stick to block device
>>> testing for now.  :)
>>
>> Good plan :-)
> 
> Took a look and fixed it up, my aio-poll branch has a fixup patch and I
> also updated aio-ring.c in the same location.
> 
> Tested XFS with polling on and off, works for me.
> 
> Will take a look at FIXEDBUFS now. Note that aio-ring has some parameters
> at the top:
> 
> static int polled = 1;  /* use IO polling */
> static int fixedbufs = 0;   /* use fixed user buffers */
> 
> that you can change to try the various modes. Don't use fixedbufs = 1
> with XFS just yet :-)

Actually, with the other fixups, that works now too. At least in a quick
test...

-- 
Jens Axboe



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 12:35 PM, Jens Axboe wrote:
> On 12/7/18 12:34 PM, Jeff Moyer wrote:
>> Jens Axboe  writes:
>>
>>> BTW, quick guess is that it doesn't work so well with fixed buffers, as that
>>> hasn't been tested. You could try and remove IOCTX_FLAG_FIXEDBUFS from the
>>> test program and see if that works.
>>
>> That results in a NULL pointer dereference.  I'll stick to block device
>> testing for now.  :)
> 
> Good plan :-)

Took a look and fixed it up, my aio-poll branch has a fixup patch and I
also updated aio-ring.c in the same location.

Tested XFS with polling on and off, works for me.

Will take a look at FIXEDBUFS now. Note that aio-ring has some parameters
at the top:

static int polled = 1;  /* use IO polling */
static int fixedbufs = 0;   /* use fixed user buffers */

that you can change to try the various modes. Don't use fixedbufs = 1
with XFS just yet :-)

-- 
Jens Axboe



Re: [GIT PULL] first batch of nvme updates for 4.21

2018-12-07 Thread Jens Axboe
On 12/7/18 12:20 PM, Christoph Hellwig wrote:
> Hi Jens,
> 
> please pull this first batch of nvme updates for Linux 4.21.
> 
> Highlights:
>  - target support for persistent discovery controllers (Jay Sternberg)
>  - target optimizations to use non-blocking reads (Chaitanya Kulkarni)
>  - host side support for the Enhanced Command Retry TP (Keith Busch)
>  - host and target support for traffic based keep alive (Sagi Grimberg)
>  - host and target support for disabling SQ based flow control (Sagi Grimberg)
> 
> Plus all kinds of optimizations and cleanups from the usual suspects.
> 
> The following changes since commit 43e497feb3987c868f87c7ca1470c5937689fe01:
> 
>   blk-mq: re-build queue map in case of kdump kernel (2018-12-06 22:52:59 
> -0700)
> 
> are available in the Git repository at:
> 
>   git://git.infradead.org/nvme.git nvme-4.21
> 
> for you to fetch changes up to 48fd68b3c5c66af6e0004787a54765a8d040989b:
> 
>   nvme: remove unused function nvme_ctrl_ready (2018-12-07 11:16:38 -0800)
> 
> 
> Chaitanya Kulkarni (4):
>   nvme: consolidate memset calls in the nvme_setup_cmd path
>   nvmet: use IOCB_NOWAIT for file-ns buffered I/O
>   nvmet: use unlikely for req status check
>   nvmet: fix the structure member indentation
> 
> Christoph Hellwig (1):
>   nvmet: mark nvmet_genctr static
> 
> Hannes Reinecke (1):
>   nvme: add a numa_node field to struct nvme_ctrl
> 
> Israel Rukshin (3):
>   nvme: Remove unused forward declaration
>   nvmet-rdma: Add unlikely for response allocated check
>   nvme: remove unused function nvme_ctrl_ready
> 
> James Smart (1):
>   nvmet-fc: remove the IN_ISR deferred scheduling options
> 
> Jay Sternberg (7):
>   nvmet: provide aen bit functions for multiple controller types
>   nvmet: change aen mask functions to use bit numbers
>   nvmet: allow Keep Alive for Discovery controller
>   nvmet: make kato and AEN processing for use by other controllers
>   nvmet: add defines for discovery change async events
>   nvmet: add support to Discovery controllers for commands
>   nvmet: enable Discovery Controller AENs
> 
> Keith Busch (1):
>   nvme: implement Enhanced Command Retry
> 
> Sagi Grimberg (9):
>   nvme: introduce ctrl attributes enumeration
>   nvme: cache controller attributes
>   nvme: support traffic based keep-alive
>   nvmet: support for traffic based keep-alive
>   nvmet: allow host connect even if no allowed subsystems are exported
>   nvmet: support fabrics sq flow control
>   nvmet: don't override treq upon modification.
>   nvmet: expose support for fabrics SQ flow control disable in treq
>   nvme: disable fabrics SQ flow control when asked by the user
> 
>  drivers/nvme/host/core.c  |  70 ++--
>  drivers/nvme/host/fabrics.c   |  13 +++-
>  drivers/nvme/host/fabrics.h   |   2 +
>  drivers/nvme/host/fc.c|   5 +-
>  drivers/nvme/host/multipath.c |   4 +-
>  drivers/nvme/host/nvme.h  |  13 ++--
>  drivers/nvme/host/rdma.c  |   5 +-
>  drivers/nvme/target/admin-cmd.c   |  76 --
>  drivers/nvme/target/configfs.c|  42 
>  drivers/nvme/target/core.c| 118 +-
>  drivers/nvme/target/discovery.c   | 129 +++--
>  drivers/nvme/target/fabrics-cmd.c |   6 ++
>  drivers/nvme/target/fc.c  |  66 +--
>  drivers/nvme/target/io-cmd-file.c | 130 
> --
>  drivers/nvme/target/nvmet.h   |  53 +++++---
>  drivers/nvme/target/rdma.c|   2 +-
>  include/linux/nvme-fc-driver.h|  16 -
>  include/linux/nvme.h  |  51 +--
>  18 files changed, 514 insertions(+), 287 deletions(-)

Pulled, thanks.

-- 
Jens Axboe



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 12:34 PM, Jeff Moyer wrote:
> Jens Axboe  writes:
> 
>> BTW, quick guess is that it doesn't work so well with fixed buffers, as that
>> hasn't been tested. You could try and remove IOCTX_FLAG_FIXEDBUFS from the
>> test program and see if that works.
> 
> That results in a NULL pointer dereference.  I'll stick to block device
> testing for now.  :)

Good plan :-)

-- 
Jens Axboe



Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 11:52 AM, Jens Axboe wrote:
> On 12/7/18 11:48 AM, Jeff Moyer wrote:
>> Hi, Jens,
>>
>> Jens Axboe  writes:
>>
>>> You can also find the patches in my aio-poll branch:
>>>
>>> http://git.kernel.dk/cgit/linux-block/log/?h=aio-poll
>>>
>>> or by cloning:
>>>
>>> git://git.kernel.dk/linux-block aio-poll
>>
>> I made an xfs file system on a partition of an nvme device.  I created a
>> 1 GB file on that file system, and then ran the test program you
>> provided.  It immediately spewed the following:
>>
>> [  139.533754] BUG: Bad page state in process aio-ring  pfn:5f394f2
>> [  139.539763] page:de6a3ce53c80 count:-38 mapcount:1 
>> mapping: index:0x1
>> [  139.547942] flags: 0x57c000()
>> [  139.551607] raw: 0057c000 dead0100 dead0200 
>> 
>> [  139.559346] raw: 0001  ffda 
>> 
>> [  139.567085] page dumped because: nonzero _refcount
>> [  139.571876] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
>> iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 
>> nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc devlink 
>> ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter sunrpc 
>> dm_mirror dm_region_hash dm_log dm_mod intel_rapl skx_edac 
>> x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass vfat fat 
>> ipmi_ssif crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate 
>> intel_uncore intel_rapl_perf iTCO_wdt iTCO_vendor_support ipmi_si mei_me 
>> pcspkr ipmi_devintf dax_pmem sg i2c_i801 lpc_ich mei device_dax 
>> ipmi_msghandler ioatdma wmi acpi_pad acpi_power_meter dca ip_tables xfs 
>> libcrc32c nd_pmem nd_btt sd_mod sr_mod cdrom ast i2c_algo_bit drm_kms_helper 
>> syscopyarea crc32c_intel sysfillrect sysimgblt i40e fb_sys_fops ttm drm nvme 
>> ahci nvme_core libahci libata nfit libnvdimm
>> [  139.649936] CPU: 46 PID: 13137 Comm: aio-ring Not tainted 4.20.0-rc5+ #1
>> [  139.656635] Hardware name: Intel Corporation ...
>> [  139.667062] Call Trace:
>> [  139.669522]  dump_stack+0x46/0x5b
>> [  139.672845]  bad_page+0xf5/0x10f
>> [  139.676080]  free_pcppages_bulk+0x588/0x5d0
>> [  139.680271]  free_unref_page_list+0xfc/0x170
>> [  139.684545]  release_pages+0x103/0x4a0
>> [  139.688293]  __pagevec_release+0x2b/0x30
>> [  139.692223]  invalidate_inode_pages2_range+0x3ea/0x560
>> [  139.697360]  ? pagevec_lookup_range_tag+0x24/0x30
>> [  139.702068]  ? __filemap_fdatawait_range+0x87/0x150
>> [  139.706951]  iomap_dio_rw+0x1de/0x3b0
>> [  139.710622]  ? current_time+0x4f/0x90
>> [  139.714288]  ? atime_needs_update+0x6c/0xd0
>> [  139.718474]  ? down_read+0xe/0x30
>> [  139.721821]  xfs_file_dio_aio_read+0x65/0xe0 [xfs]
>> [  139.726633]  xfs_file_read_iter+0xba/0xd0 [xfs]
>> [  139.731172]  aio_read+0x13e/0x1b0
>> [  139.734499]  ? iomap_dio_complete+0x98/0x150
>> [  139.738778]  ? put_aio_ring_file.isra.26+0x70/0x70
>> [  139.743568]  ? iomap_dio_complete_work+0x17/0x30
>> [  139.748188]  ? iomap_dio_bio_end_io+0x160/0x180
>> [  139.752720]  ? kmem_cache_alloc+0x170/0x1c0
>> [  139.756907]  __io_submit_one+0x3e0/0xa10
>> [  139.760832]  aio_ring_submit+0xcf/0x1a0
>> [  139.764675]  ? do_page_fault+0x2d/0x120
>> [  139.768512]  ? aio_iopoll_getevents+0x1cf/0x220
>> [  139.773042]  __x64_sys_io_ring_enter+0xf8/0x160
>> [  139.777578]  do_syscall_64+0x55/0x1a0
>> [  139.781243]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>> [  139.786294] RIP: 0033:0x7f3ea04511c9
>> [  139.789873] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 
>> 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 
>> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 97 dc 2c 00 f7 d8 64 89 01 48
>> [  139.808620] RSP: 002b:7f3ea0357e28 EFLAGS: 0246 ORIG_RAX: 
>> 0150
>> [  139.816185] RAX: ffda RBX: 0001 RCX: 
>> 7f3ea04511c9
>> [  139.823317] RDX: 0001 RSI: 0001 RDI: 
>> 7f3ea0d63000
>> [  139.830448] RBP:  R08: 0003 R09: 
>> 0003
>> [  139.837581] R10: 0003 R11: 0246 R12: 
>> 025ea080
>> [  139.844714] R13: 025ea000 R14: 00602120 R15: 
>> 0001
>> [  139.851848] Disabling lock debugging due to kernel taint
>>
>> Followed by this:
>>
>> [  163.724474] watchdog: BUG: soft

Re: [PATCHSET v5] Support for polled aio

2018-12-07 Thread Jens Axboe
On 12/7/18 11:48 AM, Jeff Moyer wrote:
> Hi, Jens,
> 
> Jens Axboe  writes:
> 
>> You can also find the patches in my aio-poll branch:
>>
>> http://git.kernel.dk/cgit/linux-block/log/?h=aio-poll
>>
>> or by cloning:
>>
>> git://git.kernel.dk/linux-block aio-poll
> 
> I made an xfs file system on a partition of an nvme device.  I created a
> 1 GB file on that file system, and then ran the test program you
> provided.  It immediately spewed the following:
> 
> [  139.533754] BUG: Bad page state in process aio-ring  pfn:5f394f2
> [  139.539763] page:de6a3ce53c80 count:-38 mapcount:1 
> mapping: index:0x1
> [  139.547942] flags: 0x57c000()
> [  139.551607] raw: 0057c000 dead0100 dead0200 
> 
> [  139.559346] raw: 0001  ffda 
> 
> [  139.567085] page dumped because: nonzero _refcount
> [  139.571876] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
> iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 
> nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc devlink 
> ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter sunrpc 
> dm_mirror dm_region_hash dm_log dm_mod intel_rapl skx_edac 
> x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass vfat fat 
> ipmi_ssif crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate 
> intel_uncore intel_rapl_perf iTCO_wdt iTCO_vendor_support ipmi_si mei_me 
> pcspkr ipmi_devintf dax_pmem sg i2c_i801 lpc_ich mei device_dax 
> ipmi_msghandler ioatdma wmi acpi_pad acpi_power_meter dca ip_tables xfs 
> libcrc32c nd_pmem nd_btt sd_mod sr_mod cdrom ast i2c_algo_bit drm_kms_helper 
> syscopyarea crc32c_intel sysfillrect sysimgblt i40e fb_sys_fops ttm drm nvme 
> ahci nvme_core libahci libata nfit libnvdimm
> [  139.649936] CPU: 46 PID: 13137 Comm: aio-ring Not tainted 4.20.0-rc5+ #1
> [  139.656635] Hardware name: Intel Corporation ...
> [  139.667062] Call Trace:
> [  139.669522]  dump_stack+0x46/0x5b
> [  139.672845]  bad_page+0xf5/0x10f
> [  139.676080]  free_pcppages_bulk+0x588/0x5d0
> [  139.680271]  free_unref_page_list+0xfc/0x170
> [  139.684545]  release_pages+0x103/0x4a0
> [  139.688293]  __pagevec_release+0x2b/0x30
> [  139.692223]  invalidate_inode_pages2_range+0x3ea/0x560
> [  139.697360]  ? pagevec_lookup_range_tag+0x24/0x30
> [  139.702068]  ? __filemap_fdatawait_range+0x87/0x150
> [  139.706951]  iomap_dio_rw+0x1de/0x3b0
> [  139.710622]  ? current_time+0x4f/0x90
> [  139.714288]  ? atime_needs_update+0x6c/0xd0
> [  139.718474]  ? down_read+0xe/0x30
> [  139.721821]  xfs_file_dio_aio_read+0x65/0xe0 [xfs]
> [  139.726633]  xfs_file_read_iter+0xba/0xd0 [xfs]
> [  139.731172]  aio_read+0x13e/0x1b0
> [  139.734499]  ? iomap_dio_complete+0x98/0x150
> [  139.738778]  ? put_aio_ring_file.isra.26+0x70/0x70
> [  139.743568]  ? iomap_dio_complete_work+0x17/0x30
> [  139.748188]  ? iomap_dio_bio_end_io+0x160/0x180
> [  139.752720]  ? kmem_cache_alloc+0x170/0x1c0
> [  139.756907]  __io_submit_one+0x3e0/0xa10
> [  139.760832]  aio_ring_submit+0xcf/0x1a0
> [  139.764675]  ? do_page_fault+0x2d/0x120
> [  139.768512]  ? aio_iopoll_getevents+0x1cf/0x220
> [  139.773042]  __x64_sys_io_ring_enter+0xf8/0x160
> [  139.777578]  do_syscall_64+0x55/0x1a0
> [  139.781243]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  139.786294] RIP: 0033:0x7f3ea04511c9
> [  139.789873] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 
> f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 
> 01 f0 ff ff 73 01 c3 48 8b 0d 97 dc 2c 00 f7 d8 64 89 01 48
> [  139.808620] RSP: 002b:7f3ea0357e28 EFLAGS: 0246 ORIG_RAX: 
> 0150
> [  139.816185] RAX: ffda RBX: 0001 RCX: 
> 7f3ea04511c9
> [  139.823317] RDX: 0001 RSI: 0001 RDI: 
> 7f3ea0d63000
> [  139.830448] RBP:  R08: 0003 R09: 
> 0003
> [  139.837581] R10: 0003 R11: 0246 R12: 
> 025ea080
> [  139.844714] R13: 025ea000 R14: 00602120 R15: 
> 0001
> [  139.851848] Disabling lock debugging due to kernel taint
> 
> Followed by this:
> 
> [  163.724474] watchdog: BUG: soft lockup - CPU#46 stuck for 22s! 
> [aio-ring:13137]
> [  163.731786] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE 
> iptable_nat nf_nat_ipv4 nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 
> nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc devlink 
> ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter sunrpc 
> dm_mirror dm_region_hash dm_log dm_

[GIT PULL] Block fixes for 4.20-rc6

2018-12-07 Thread Jens Axboe
Hi Linus,

Let's try this again... We're finally happy with the DM livelock issue,
and it's also passed overnight testing and the corruption regression
test. The end result is much nicer now too, which is great. Outside of
that fix, there's a pull request for NVMe with two small fixes, and a
regression fix for BFQ from this merge window. The BFQ fix looks bigger
than it is; it's 90% comment updates.

Please pull!


  git://git.kernel.dk/linux-block.git tags/for-linus-20181207



Israel Rukshin (1):
  nvmet-rdma: fix response use after free

James Smart (1):
  nvme: validate controller state before rescheduling keep alive

Jens Axboe (2):
  blk-mq: punt failed direct issue to dispatch list
  Merge branch 'nvme-4.20' of git://git.infradead.org/nvme into for-linus

Paolo Valente (1):
  block, bfq: fix decrement of num_active_groups

 block/bfq-iosched.c| 76 --
 block/bfq-iosched.h| 51 +--
 block/bfq-wf2q.c   |  5 ++-
 block/blk-mq.c | 33 +++-
 drivers/nvme/host/core.c   | 10 +-
 drivers/nvme/target/rdma.c |  3 +-
 6 files changed, 123 insertions(+), 55 deletions(-)

-- 
Jens Axboe



Re: [PATCH v3] blk-mq: punt failed direct issue to dispatch list

2018-12-07 Thread Jens Axboe
On 12/7/18 9:41 AM, Bart Van Assche wrote:
> On Fri, 2018-12-07 at 09:35 -0700, Jens Axboe wrote:
>> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
>> index 29bfe8017a2d..9e5bda8800f8 100644
>> --- a/block/blk-mq-sched.c
>> +++ b/block/blk-mq-sched.c
>> @@ -377,6 +377,16 @@ void blk_mq_sched_insert_request(struct request *rq, 
>> bool at_head,
>>  
>>  WARN_ON(e && (rq->tag != -1));
>>  
>> +/*
>> + * It's illegal to insert a request into the scheduler that has
>> + * been through ->queue_rq(). Warn for that case, and use a bypass
>> + * insert to be safe.
>> + */
> 
> Shouldn't this refer to requests that have been prepared instead of requests
> that have been through ->queue_rq()? I think this function is called for
> requests that are requeued. Requeued requests have been through ->queue_rq()
> but are unprepared before being requeued.

If they are unprepared, RQF_DONTPREP should have been cleared. But that
needs testing and verification, which is exactly why I didn't want to
bundle it with the fix.

I'll test it later today.

-- 
Jens Axboe



Re: [PATCH v3] blk-mq: punt failed direct issue to dispatch list

2018-12-07 Thread Jens Axboe
On 12/7/18 9:24 AM, Jens Axboe wrote:
> On 12/7/18 9:19 AM, Bart Van Assche wrote:
>> On Thu, 2018-12-06 at 22:17 -0700, Jens Axboe wrote:
>>> Instead of making special cases for what we can direct issue, and now
>>> having to deal with DM solving the livelock while still retaining a BUSY
>>> condition feedback loop, always just add a request that has been through
>>> ->queue_rq() to the hardware queue dispatch list. These are safe to use
>>> as no merging can take place there. Additionally, if requests do have
>>> prepped data from drivers, we aren't dependent on them not sharing space
>>> in the request structure to safely add them to the IO scheduler lists.
>>
>> How about making blk_mq_sched_insert_request() complain if a request is 
>> passed
>> to it in which the RQF_DONTPREP flag has been set to avoid that this problem 
>> is
>> reintroduced in the future? Otherwise this patch looks fine to me.
> 
> I agree, but I think we should do that as a follow up patch. I don't want to
> touch this one if we can avoid it. The thought did cross my mind, too. It
> should be impossible now that everything goes to the dispatch list.

Something like the below.


diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 29bfe8017a2d..9e5bda8800f8 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -377,6 +377,16 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
 
WARN_ON(e && (rq->tag != -1));
 
+   /*
+* It's illegal to insert a request into the scheduler that has
+* been through ->queue_rq(). Warn for that case, and use a bypass
+* insert to be safe.
+*/
+   if (WARN_ON_ONCE(rq->rq_flags & RQF_DONTPREP)) {
+   blk_mq_request_bypass_insert(rq, false);
+   goto run;
+   }
+
if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
goto run;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6a7566244de3..d5f890d5c814 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1595,15 +1595,25 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, 
struct blk_mq_ctx *ctx,
struct list_head *list)
 
 {
-   struct request *rq;
+   struct request *rq, *tmp;
 
/*
 * preemption doesn't flush plug list, so it's possible ctx->cpu is
 * offline now
 */
-   list_for_each_entry(rq, list, queuelist) {
+   list_for_each_entry_safe(rq, tmp, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
trace_block_rq_insert(hctx->queue, rq);
+
+   /*
+* It's illegal to insert a request into the scheduler that has
+* been through ->queue_rq(). Warn for that case, and use a
+* bypass insert to be safe.
+*/
+   if (WARN_ON_ONCE(rq->rq_flags & RQF_DONTPREP)) {
+   list_del_init(&rq->queuelist);
+   blk_mq_request_bypass_insert(rq, false);
+   }
}
 
spin_lock(&ctx->lock);

-- 
Jens Axboe



Re: [PATCH v3] blk-mq: punt failed direct issue to dispatch list

2018-12-07 Thread Jens Axboe
On 12/7/18 9:19 AM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 22:17 -0700, Jens Axboe wrote:
>> Instead of making special cases for what we can direct issue, and now
>> having to deal with DM solving the livelock while still retaining a BUSY
>> condition feedback loop, always just add a request that has been through
>> ->queue_rq() to the hardware queue dispatch list. These are safe to use
>> as no merging can take place there. Additionally, if requests do have
>> prepped data from drivers, we aren't dependent on them not sharing space
>> in the request structure to safely add them to the IO scheduler lists.
> 
> How about making blk_mq_sched_insert_request() complain if a request is passed
> to it in which the RQF_DONTPREP flag has been set to avoid that this problem 
> is
> reintroduced in the future? Otherwise this patch looks fine to me.

I agree, but I think we should do that as a follow up patch. I don't want to
touch this one if we can avoid it. The thought did cross my mind, too. It
should be impossible now that everything goes to the dispatch list.

-- 
Jens Axboe



Re: [GIT PULL] nvme fixes for 4.20

2018-12-07 Thread Jens Axboe
On 12/7/18 8:38 AM, Christoph Hellwig wrote:
> The following changes since commit ba7aeae5539c7a74cf07a2bc61281a93c50e:
> 
>   block, bfq: fix decrement of num_active_groups (2018-12-07 07:40:07 -0700)
> 
> are available in the Git repository at:
> 
>   git://git.infradead.org/nvme.git nvme-4.20
> 
> for you to fetch changes up to d7dcdf9d4e15189ecfda24cc87339a3425448d5c:
> 
>   nvmet-rdma: fix response use after free (2018-12-07 07:11:11 -0800)
> 
> 
> Israel Rukshin (1):
>   nvmet-rdma: fix response use after free
> 
> James Smart (1):
>   nvme: validate controller state before rescheduling keep alive
> 
>  drivers/nvme/host/core.c   | 10 +-
>  drivers/nvme/target/rdma.c |  3 ++-
>  2 files changed, 11 insertions(+), 2 deletions(-)

Pulled, thanks.

-- 
Jens Axboe



Re: [PATCH V2] blk-mq: re-build queue map in case of kdump kernel

2018-12-06 Thread Jens Axboe
On 12/6/18 8:03 PM, Ming Lei wrote:
> Now almost all .map_queues() implementations based on managed irq
> affinity don't update the queue mapping; they just retrieve the
> old built mapping, so if nr_hw_queues is changed, the mapping table
> includes stale mappings. Only blk_mq_map_queues() may rebuild
> the mapping table.
> 
> One case is that we limit .nr_hw_queues to 1 in case of a kdump kernel.
> However, drivers often build the queue mapping before allocating the
> tagset via pci_alloc_irq_vectors_affinity(), but set->nr_hw_queues can
> be set as 1 in case of a kdump kernel, so the wrong queue mapping is
> used, and a kernel panic[1] is observed during booting.
> 
> This patch fixes the kernel panic triggered on nvme by rebuilding the
> mapping table via blk_mq_map_queues().
> 
> [1] kernel panic log
> [4.438371] nvme nvme0: 16/0/0 default/read/poll queues
> [4.443277] BUG: unable to handle kernel NULL pointer dereference at 
> 0098
> [4.444681] PGD 0 P4D 0
> [4.445367] Oops:  [#1] SMP NOPTI
> [4.446342] CPU: 3 PID: 201 Comm: kworker/u33:10 Not tainted 
> 4.20.0-rc5-00664-g5eb02f7ee1eb-dirty #459
> [4.447630] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.10.2-2.fc27 04/01/2014
> [4.448689] Workqueue: nvme-wq nvme_scan_work [nvme_core]
> [4.449368] RIP: 0010:blk_mq_map_swqueue+0xfb/0x222
> [4.450596] Code: 04 f5 20 28 ef 81 48 89 c6 39 55 30 76 93 89 d0 48 c1 e0 
> 04 48 03 83 f8 05 00 00 48 8b 00 42 8b 3c 28 48 8b 43 58 48 8b 04 f8 <48> 8b 
> b8 98 00 00 00 4c 0f a3 37 72 42 f0 4c 0f ab 37 66 8b b8 f6
> [4.453132] RSP: 0018:c900023b3cd8 EFLAGS: 00010286
> [4.454061] RAX:  RBX: 888174448000 RCX: 
> 0001
> [4.456480] RDX: 0001 RSI: e8feffc506c0 RDI: 
> 0001
> [4.458750] RBP: 88810722d008 R08: 88817647a880 R09: 
> 0002
> [4.464580] R10: c900023b3c10 R11: 0004 R12: 
> 888174448538
> [4.467803] R13: 0004 R14: 0001 R15: 
> 0001
> [4.469220] FS:  () GS:88817bac() 
> knlGS:
> [4.471554] CS:  0010 DS:  ES:  CR0: 80050033
> [4.472464] CR2: 0098 CR3: 000174e4e001 CR4: 
> 00760ee0
> [4.474264] DR0:  DR1:  DR2: 
> 
> [4.476007] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [4.477061] PKRU: 5554
> [4.477464] Call Trace:
> [4.478731]  blk_mq_init_allocated_queue+0x36a/0x3ad
> [4.479595]  blk_mq_init_queue+0x32/0x4e
> [4.480178]  nvme_validate_ns+0x98/0x623 [nvme_core]
> [4.480963]  ? nvme_submit_sync_cmd+0x1b/0x20 [nvme_core]
> [4.481685]  ? nvme_identify_ctrl.isra.8+0x70/0xa0 [nvme_core]
> [4.482601]  nvme_scan_work+0x23a/0x29b [nvme_core]
> [4.483269]  ? _raw_spin_unlock_irqrestore+0x25/0x38
> [4.483930]  ? try_to_wake_up+0x38d/0x3b3
> [4.484478]  ? process_one_work+0x179/0x2fc
> [4.485118]  process_one_work+0x1d3/0x2fc
> [4.485655]  ? rescuer_thread+0x2ae/0x2ae
> [4.486196]  worker_thread+0x1e9/0x2be
> [4.486841]  kthread+0x115/0x11d
> [4.487294]  ? kthread_park+0x76/0x76
> [4.487784]  ret_from_fork+0x3a/0x50
> [4.488322] Modules linked in: nvme nvme_core qemu_fw_cfg virtio_scsi 
> ip_tables
> [4.489428] Dumping ftrace buffer:
> [4.489939](ftrace buffer empty)
> [4.490492] CR2: 0098
> [4.491052] ---[ end trace 03cd268ad5a86ff7 ]---

Works for me, tested various configs and stubbed out the kdump check.
Thanks for fixing this, applied to 4.21.

-- 
Jens Axboe



[PATCH v3] blk-mq: punt failed direct issue to dispatch list

2018-12-06 Thread Jens Axboe
After the direct dispatch corruption fix, we permanently disallow direct
dispatch of non read/write requests. This works fine off the normal IO
path, as they will be retried like any other failed direct dispatch
request. But for the blk_insert_cloned_request() that only DM uses to
bypass the bottom level scheduler, we always first attempt direct
dispatch. For some types of requests, that's now a permanent failure,
and no amount of retrying will make that succeed. This results in a
livelock.

Instead of making special cases for what we can direct issue, and now
having to deal with DM solving the livelock while still retaining a BUSY
condition feedback loop, always just add a request that has been through
->queue_rq() to the hardware queue dispatch list. These are safe to use
as no merging can take place there. Additionally, if requests do have
prepped data from drivers, we aren't dependent on them not sharing space
in the request structure to safely add them to the IO scheduler lists.

This basically reverts ffe81d45322c and is based on a patch from Ming,
but with the list insert case covered as well.

Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
Cc: sta...@vger.kernel.org
Suggested-by: Ming Lei 
Reported-by: Bart Van Assche 
Signed-off-by: Jens Axboe 

---

I've thrown the initial hang test reported by Bart at it, works fine.
My reproducer for the corruption case is also happy, as expected.

I'm running blktests and xfstests on it overnight. If that passes as
expected, it should settle my initial qualms about using ->dispatch as
a holding place for these types of requests.


diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3262d83b9e07..6a7566244de3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1715,15 +1715,6 @@ static blk_status_t __blk_mq_issue_directly(struct 
blk_mq_hw_ctx *hctx,
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
-   /*
-* If direct dispatch fails, we cannot allow any merging on
-* this IO. Drivers (like SCSI) may have set up permanent state
-* for this request, like SG tables and mappings, and if we
-* merge to it later on then we'll still only do IO to the
-* original part.
-*/
-   rq->cmd_flags |= REQ_NOMERGE;
-
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
@@ -1736,18 +1727,6 @@ static blk_status_t __blk_mq_issue_directly(struct 
blk_mq_hw_ctx *hctx,
return ret;
 }
 
-/*
- * Don't allow direct dispatch of anything but regular reads/writes,
- * as some of the other commands can potentially share request space
- * with data we need for the IO scheduler. If we attempt a direct dispatch
- * on those and fail, we can't safely add it to the scheduler afterwards
- * without potentially overwriting data that the driver has already written.
- */
-static bool blk_rq_can_direct_dispatch(struct request *rq)
-{
-   return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
-}
-
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
@@ -1769,7 +1748,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct 
blk_mq_hw_ctx *hctx,
goto insert;
}
 
-   if (!blk_rq_can_direct_dispatch(rq) || (q->elevator && !bypass_insert))
+   if (q->elevator && !bypass_insert)
goto insert;
 
if (!blk_mq_get_dispatch_budget(hctx))
@@ -1785,7 +1764,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct 
blk_mq_hw_ctx *hctx,
if (bypass_insert)
return BLK_STS_RESOURCE;
 
-   blk_mq_sched_insert_request(rq, false, run_queue, false);
+   blk_mq_request_bypass_insert(rq, run_queue);
return BLK_STS_OK;
 }
 
@@ -1801,7 +1780,7 @@ static void blk_mq_try_issue_directly(struct 
blk_mq_hw_ctx *hctx,
 
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-   blk_mq_sched_insert_request(rq, false, true, false);
+   blk_mq_request_bypass_insert(rq, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
 
@@ -1831,15 +1810,13 @@ void blk_mq_try_issue_list_directly(struct 
blk_mq_hw_ctx *hctx,
struct request *rq = list_first_entry(list, struct request,
queuelist);
 
-   if (!blk_rq_can_direct_dispatch(rq))
-   break;
-
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq);
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
   

Re: [PATCH v2] block/dm: fix handling of busy off direct dispatch path

2018-12-06 Thread Jens Axboe
On 12/6/18 9:18 PM, Mike Snitzer wrote:
> On Thu, Dec 06 2018 at 11:06pm -0500,
> Jens Axboe  wrote:
> 
>> On 12/6/18 8:54 PM, Mike Snitzer wrote:
>>> On Thu, Dec 06 2018 at  9:49pm -0500,
>>> Jens Axboe  wrote:
>>>
>>>> After the direct dispatch corruption fix, we permanently disallow direct
>>>> dispatch of non read/write requests. This works fine off the normal IO
>>>> path, as they will be retried like any other failed direct dispatch
>>>> request. But for the blk_insert_cloned_request() that only DM uses to
>>>> bypass the bottom level scheduler, we always first attempt direct
>>>> dispatch. For some types of requests, that's now a permanent failure,
>>>> and no amount of retrying will make that succeed.
>>>>
>>>> Use the driver private RQF_DONTPREP to track this condition in DM. If
>>>> we encounter a BUSY condition from blk_insert_cloned_request(), then
>>>> flag the request with RQF_DONTPREP. When we next time see this request,
>>>> ask blk_insert_cloned_request() to bypass insert the request directly.
>>>> This avoids the livelock of repeatedly trying to direct dispatch a
>>>> request, while still retaining the BUSY feedback loop for blk-mq so
>>>> that we don't over-dispatch to the lower level queue and mess up
>>>> opportunities for merging on the DM queue.
>>>>
>>>> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
>>>> Reported-by: Bart Van Assche 
>>>> Cc: sta...@vger.kernel.org
>>>> Signed-off-by: Jens Axboe 
>>>>
>>>> ---
>>>>
>>>> This passes my testing as well, like the previous patch. But unlike the
>>>> previous patch, we retain the BUSY feedback loop information for better
>>>> merging.
>>>
>>> But it is kind of gross to workaround the new behaviour to "permanently
>>> disallow direct dispatch of non read/write requests" by always failing
>>> such requests back to DM for later immediate direct dispatch.  That
>>> bouncing of the request was acceptable when there was load-based
>>> justification for having to retry (and in doing so: taking the cost of
>>> freeing the clone request gotten via get_request() from the underlying
>>> request_queues).
>>>
>>> Having to retry like this purely because the request isn't a read or
>>> write seems costly.. every non-read-write will have implied
>>> request_queue bouncing.  In multipath's case: it could select an
>>> entirely different underlying path the next time it is destaged (with
>>> RQF_DONTPREP set).  Which you'd think would negate all hope of IO
>>> merging based performance improvements -- but that is a tangent I'll
>>> need to ask Ming about (again).
>>>
>>> I really don't like this business of bouncing requests as a workaround
>>> for the recent implementation of the corruption fix.
>>>
>>> Why not just add an override flag to _really_ allow direct dispatch for
>>> _all_ types of requests?
>>>
>>> (just peeked at linux-block and it is looking like you took 
>>> jianchao.wang's series to avoid this hack... ;)
>>>
>>> Awesome.. my work is done for tonight!
>>
>> The whole point is doing something that is palatable to 4.20 and leaving
>> the more experimental stuff to 4.21, where we have some weeks to verify
>> that there are no conditions that cause IO stalls. I don't envision there
>> will be, but I'm not willing to risk it this late in the 4.20 cycle.
>>
>> That said, this isn't quick and dirty, and I don't think it's fair
>> calling this a hack. Using RQF_DONTPREP is quite common in drivers to
>> retain state over multiple ->queue_rq invocations. Using it to avoid
>> multiple direct dispatch failures (and obviously this new livelock)
>> seems fine to me.
> 
> But it bounces IO purely because non-read-write.  That results in
> guaranteed multiple blk_get_request() -- from underlying request_queues
> request-based DM is stacked on -- for every non-read-write IO that is
> cloned.  That seems pathological.  I must still be missing something.
> 
>> I really don't want to go around and audit every driver for potential
>> retained state over special commands, that's why the read+write thing is
>> in place. It's the safe option, which is what we need right now.
> 
> Maybe leave blk_mq_request_issue_directly() interface how it is,
> non-read-write restriction and all, but export a new
> __blk

Re: [PATCH v2] block/dm: fix handling of busy off direct dispatch path

2018-12-06 Thread Jens Axboe
On 12/6/18 8:54 PM, Mike Snitzer wrote:
> On Thu, Dec 06 2018 at  9:49pm -0500,
> Jens Axboe  wrote:
> 
>> After the direct dispatch corruption fix, we permanently disallow direct
>> dispatch of non read/write requests. This works fine off the normal IO
>> path, as they will be retried like any other failed direct dispatch
>> request. But for the blk_insert_cloned_request() that only DM uses to
>> bypass the bottom level scheduler, we always first attempt direct
>> dispatch. For some types of requests, that's now a permanent failure,
>> and no amount of retrying will make that succeed.
>>
>> Use the driver private RQF_DONTPREP to track this condition in DM. If
>> we encounter a BUSY condition from blk_insert_cloned_request(), then
>> flag the request with RQF_DONTPREP. When we next time see this request,
>> ask blk_insert_cloned_request() to bypass insert the request directly.
>> This avoids the livelock of repeatedly trying to direct dispatch a
>> request, while still retaining the BUSY feedback loop for blk-mq so
>> that we don't over-dispatch to the lower level queue and mess up
>> opportunities for merging on the DM queue.
>>
>> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
>> Reported-by: Bart Van Assche 
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Jens Axboe 
>>
>> ---
>>
>> This passes my testing as well, like the previous patch. But unlike the
>> previous patch, we retain the BUSY feedback loop information for better
>> merging.
> 
> But it is kind of gross to workaround the new behaviour to "permanently
> disallow direct dispatch of non read/write requests" by always failing
> such requests back to DM for later immediate direct dispatch.  That
> bouncing of the request was acceptable when there was load-based
> justification for having to retry (and in doing so: taking the cost of
> freeing the clone request gotten via get_request() from the underlying
> request_queues).
> 
> Having to retry like this purely because the request isn't a read or
> write seems costly.. every non-read-write will have implied
> request_queue bouncing.  In multipath's case: it could select an
> entirely different underlying path the next time it is destaged (with
> RQF_DONTPREP set).  Which you'd think would negate all hope of IO
> merging based performance improvements -- but that is a tangent I'll
> need to ask Ming about (again).
> 
> I really don't like this business of bouncing requests as a workaround
> for the recent implementation of the corruption fix.
> 
> Why not just add an override flag to _really_ allow direct dispatch for
> _all_ types of requests?
> 
> (just peeked at linux-block and it is looking like you took 
> jianchao.wang's series to avoid this hack... ;)
> 
> Awesome.. my work is done for tonight!

The whole point is doing something that is palatable to 4.20 and leaving
the more experimental stuff to 4.21, where we have some weeks to verify
that there are no conditions that cause IO stalls. I don't envision there
will be, but I'm not willing to risk it this late in the 4.20 cycle.

That said, this isn't quick and dirty, and I don't think it's fair
calling this a hack. Using RQF_DONTPREP is quite common in drivers to
retain state over multiple ->queue_rq invocations. Using it to avoid
multiple direct dispatch failures (and obviously this new livelock)
seems fine to me.

I really don't want to go around and audit every driver for potential
retained state over special commands, that's why the read+write thing is
in place. It's the safe option, which is what we need right now.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-06 Thread Jens Axboe
On 12/6/18 7:46 PM, Theodore Y. Ts'o wrote:
> On Wed, Dec 05, 2018 at 11:03:01AM +0800, Ming Lei wrote:
>>
>> But at that time, there isn't io scheduler for MQ, so in theory the
>> issue should be there since v4.11, especially 945ffb60c11d ("mq-deadline:
>> add blk-mq adaptation of the deadline IO scheduler").
> 
> Hi Ming,
> 
> How were serious you about this issue being there (theoretically) an
> issue since 4.11?  Can you talk about how it might get triggered, and
> how we can test for it?  The reason why I ask is because we're trying
> to track down a mysterious file system corruption problem on a 4.14.x
> stable kernel.  The symptoms are *very* eerily similar to kernel
> bugzilla #201685.
> 
> The problem is that the problem is super-rare --- roughly once a week
> out of a popuation of about 2500 systems.  The workload is NFS
> serving.  Unfortunately, the problem is since 4.14.63, we can no
> longer disable blk-mq for the virtio-scsi driver, thanks to the commit
> b5b6e8c8d3b4 ("scsi: virtio_scsi: fix IO hang caused by automatic irq
> vector affinity") getting backported into 4.14.63 as commit
> 70b522f163bbb32.
> 
> We're considering reverting this patch in our 4.14 LTS kernel, and
> seeing whether it makes the problem go away.  Is there any thing else
> you might suggest?

We should just make SCSI do the right thing, which is to unprep if
it sees BUSY and prep again next time. Otherwise I fear the direct
dispatch isn't going to be super useful, if a failed direct dispatch
prevents future merging.

This would be a lot less error prone as well for other cases.
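
Roughly, the idea would be something like the sketch below. This is
hypothetical and untested; my_prep_and_issue() and my_unprep_request()
are made-up stand-ins for the driver's prep/issue path and for whatever
teardown (SG tables, mappings) it did during prep:

	static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
					const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;
		blk_status_t ret;

		ret = my_prep_and_issue(rq);
		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
			/* undo prep so a later (possibly merged) retry preps again */
			my_unprep_request(rq);
			rq->rq_flags &= ~RQF_DONTPREP;
		}
		return ret;
	}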

-- 
Jens Axboe



Re: [PATCH] blk-mq: re-build queue map in case of kdump kernel

2018-12-06 Thread Jens Axboe
On 12/6/18 7:55 PM, Ming Lei wrote:
> Now almost all .map_queues() implementations based on managed irq
> affinity don't update the queue mapping; they just retrieve the
> old built mapping, so if nr_hw_queues is changed, the mapping table
> includes stale mappings. Only blk_mq_map_queues() may rebuild
> the mapping table.
> 
> One case is that we limit .nr_hw_queues to 1 in case of a kdump kernel.
> However, drivers often build the queue mapping before allocating the
> tagset via pci_alloc_irq_vectors_affinity(), but set->nr_hw_queues can
> be set as 1 in case of a kdump kernel, so the wrong queue mapping is
> used, and a kernel panic[1] is observed during booting.
> 
> This patch fixes the kernel panic triggered on nvme by rebuilding the
> mapping table via blk_mq_map_queues().
> 
> [1] kernel panic log
> [4.438371] nvme nvme0: 16/0/0 default/read/poll queues
> [4.443277] BUG: unable to handle kernel NULL pointer dereference at 
> 0098
> [4.444681] PGD 0 P4D 0
> [4.445367] Oops:  [#1] SMP NOPTI
> [4.446342] CPU: 3 PID: 201 Comm: kworker/u33:10 Not tainted 
> 4.20.0-rc5-00664-g5eb02f7ee1eb-dirty #459
> [4.447630] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.10.2-2.fc27 04/01/2014
> [4.448689] Workqueue: nvme-wq nvme_scan_work [nvme_core]
> [4.449368] RIP: 0010:blk_mq_map_swqueue+0xfb/0x222
> [4.450596] Code: 04 f5 20 28 ef 81 48 89 c6 39 55 30 76 93 89 d0 48 c1 e0 
> 04 48 03 83 f8 05 00 00 48 8b 00 42 8b 3c 28 48 8b 43 58 48 8b 04 f8 <48> 8b 
> b8 98 00 00 00 4c 0f a3 37 72 42 f0 4c 0f ab 37 66 8b b8 f6
> [4.453132] RSP: 0018:c900023b3cd8 EFLAGS: 00010286
> [4.454061] RAX:  RBX: 888174448000 RCX: 
> 0001
> [4.456480] RDX: 0001 RSI: e8feffc506c0 RDI: 
> 0001
> [4.458750] RBP: 88810722d008 R08: 88817647a880 R09: 
> 0002
> [4.464580] R10: c900023b3c10 R11: 0004 R12: 
> 888174448538
> [4.467803] R13: 0004 R14: 0001 R15: 
> 0001
> [4.469220] FS:  () GS:88817bac() 
> knlGS:
> [4.471554] CS:  0010 DS:  ES:  CR0: 80050033
> [4.472464] CR2: 0098 CR3: 000174e4e001 CR4: 
> 00760ee0
> [4.474264] DR0:  DR1:  DR2: 
> 
> [4.476007] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [4.477061] PKRU: 5554
> [4.477464] Call Trace:
> [4.478731]  blk_mq_init_allocated_queue+0x36a/0x3ad
> [4.479595]  blk_mq_init_queue+0x32/0x4e
> [4.480178]  nvme_validate_ns+0x98/0x623 [nvme_core]
> [4.480963]  ? nvme_submit_sync_cmd+0x1b/0x20 [nvme_core]
> [4.481685]  ? nvme_identify_ctrl.isra.8+0x70/0xa0 [nvme_core]
> [4.482601]  nvme_scan_work+0x23a/0x29b [nvme_core]
> [4.483269]  ? _raw_spin_unlock_irqrestore+0x25/0x38
> [4.483930]  ? try_to_wake_up+0x38d/0x3b3
> [4.484478]  ? process_one_work+0x179/0x2fc
> [4.485118]  process_one_work+0x1d3/0x2fc
> [4.485655]  ? rescuer_thread+0x2ae/0x2ae
> [4.486196]  worker_thread+0x1e9/0x2be
> [4.486841]  kthread+0x115/0x11d
> [4.487294]  ? kthread_park+0x76/0x76
> [4.487784]  ret_from_fork+0x3a/0x50
> [4.488322] Modules linked in: nvme nvme_core qemu_fw_cfg virtio_scsi 
> ip_tables
> [4.489428] Dumping ftrace buffer:
> [4.489939](ftrace buffer empty)
> [4.490492] CR2: 0098
> [4.491052] ---[ end trace 03cd268ad5a86ff7 ]---
> 
> Cc: Christoph Hellwig 
> Cc: linux-n...@lists.infradead.org
> Cc: David Milburn 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-mq.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 900550594651..a3e463a726a6 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -38,6 +38,11 @@
>  #include "blk-mq-sched.h"
>  #include "blk-rq-qos.h"
>  
> +static inline bool blk_mq_kdump_kernel(void)
> +{
> + return !!is_kdump_kernel();
> +}

Let's drop the redundant !! here, and the wrapper? I would imagine the
wrapper is handy for testing outside of kdump, but I don't think we
should include it in the final.

Otherwise this looks fine, I can test it here too.

-- 
Jens Axboe



[PATCH v2] block/dm: fix handling of busy off direct dispatch path

2018-12-06 Thread Jens Axboe
After the direct dispatch corruption fix, we permanently disallow direct
dispatch of non read/write requests. This works fine off the normal IO
path, as they will be retried like any other failed direct dispatch
request. But for the blk_insert_cloned_request() that only DM uses to
bypass the bottom level scheduler, we always first attempt direct
dispatch. For some types of requests, that's now a permanent failure,
and no amount of retrying will make that succeed.

Use the driver-private RQF_DONTPREP flag to track this condition in DM. If
we encounter a BUSY condition from blk_insert_cloned_request(), then
flag the request with RQF_DONTPREP. The next time we see this request,
ask blk_insert_cloned_request() to bypass-insert the request directly.
This avoids the livelock of repeatedly trying to direct dispatch a
request, while still retaining the BUSY feedback loop for blk-mq so
that we don't over-dispatch to the lower level queue and mess up
opportunities for merging on the DM queue.

Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
Reported-by: Bart Van Assche 
Cc: sta...@vger.kernel.org
Signed-off-by: Jens Axboe 

---

This passes my testing as well, like the previous patch. But unlike the
previous patch, we retain the BUSY feedback loop information for better
merging.

diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..cccda51e165f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2617,7 +2617,8 @@ static int blk_cloned_rq_check_limits(struct 
request_queue *q,
  * @q:  the queue to submit the request
  * @rq: the request being queued
  */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq)
+blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq,
+  bool force_insert)
 {
unsigned long flags;
int where = ELEVATOR_INSERT_BACK;
@@ -2637,7 +2638,11 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   return blk_mq_request_issue_directly(rq);
+   if (force_insert) {
+   blk_mq_request_bypass_insert(rq, true);
+   return BLK_STS_OK;
+   } else
+   return blk_mq_request_issue_directly(rq);
}
 
spin_lock_irqsave(q->queue_lock, flags);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..e497a2ab6766 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -299,16 +299,20 @@ static void end_clone_request(struct request *clone, 
blk_status_t error)
 
 static blk_status_t dm_dispatch_clone_request(struct request *clone, struct 
request *rq)
 {
+   bool was_busy = (rq->rq_flags & RQF_DONTPREP) != 0;
blk_status_t r;
 
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
 
clone->start_time_ns = ktime_get_ns();
-   r = blk_insert_cloned_request(clone->q, clone);
-   if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != 
BLK_STS_DEV_RESOURCE)
+   r = blk_insert_cloned_request(clone->q, clone, was_busy);
+   if (r == BLK_STS_RESOURCE || r == BLK_STS_DEV_RESOURCE)
+   rq->rq_flags |= RQF_DONTPREP;
+   else if (r != BLK_STS_OK)
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
+
return r;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..7cb84ee4c9f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -994,7 +994,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct 
request *rq_src,
 void *data);
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-struct request *rq);
+struct request *rq, bool force_insert);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);

-- 
Jens Axboe



Re: block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 7:16 PM, Jens Axboe wrote:
> On 12/6/18 7:06 PM, Jens Axboe wrote:
>> On 12/6/18 6:58 PM, Mike Snitzer wrote:
>>>> There is another way to fix this - still do the direct dispatch, but have
>>>> dm track if it failed and do bypass insert in that case. I didn't want to
>>>> do that since it's more involved, but it's doable.
>>>>
>>>> Let me cook that up and test it... Don't like it, though.
>>>
>>> Not following how DM can track if issuing the request worked if it is
>>> always told it worked with BLK_STS_OK.  We care about feedback when the
>>> request is actually issued because of the elaborate way blk-mq elevators
>>> work.  DM is forced to worry about all these details, as covered some in
>>> the header for commit 396eaf21ee17c476e8f66249fb1f4a39003d0ab4, it is
>>> trying to have its cake and eat it too.  It just wants IO scheduling to
>>> work for request-based DM devices.  That's it.
>>
>> It needs the feedback, I don't disagree on that part at all. If we
>> always return OK, then that loop is broken. How about something like the
>> below? Totally untested right now...
>>
>> We track if a request ever saw BLK_STS_RESOURCE from direct dispatch,
>> and if it did, we store that information with RQF_DONTPREP. When we then
>> next time go and insert a request, if it has RQF_DONTPREP set, then we
>> ask blk_insert_cloned_request() to bypass insert.
>>
>> I'll go test this now.
> 
> Passes the test case for me.

Here's one that doesn't re-arrange the return value check into a switch.
Turns out cleaner (and fewer LOC changed), and also doesn't touch the
request after freeing it if we got an OK return...

Will give this a whirl too, just in case.

diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..cccda51e165f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2617,7 +2617,8 @@ static int blk_cloned_rq_check_limits(struct 
request_queue *q,
  * @q:  the queue to submit the request
  * @rq: the request being queued
  */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq)
+blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq,
+  bool force_insert)
 {
unsigned long flags;
int where = ELEVATOR_INSERT_BACK;
@@ -2637,7 +2638,11 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   return blk_mq_request_issue_directly(rq);
+   if (force_insert) {
+   blk_mq_request_bypass_insert(rq, true);
+   return BLK_STS_OK;
+   } else
+   return blk_mq_request_issue_directly(rq);
}
 
spin_lock_irqsave(q->queue_lock, flags);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..e497a2ab6766 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -299,16 +299,20 @@ static void end_clone_request(struct request *clone, 
blk_status_t error)
 
 static blk_status_t dm_dispatch_clone_request(struct request *clone, struct 
request *rq)
 {
+   bool was_busy = (rq->rq_flags & RQF_DONTPREP) != 0;
blk_status_t r;
 
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
 
clone->start_time_ns = ktime_get_ns();
-   r = blk_insert_cloned_request(clone->q, clone);
-   if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != 
BLK_STS_DEV_RESOURCE)
+   r = blk_insert_cloned_request(clone->q, clone, was_busy);
+   if (r == BLK_STS_RESOURCE || r == BLK_STS_DEV_RESOURCE)
+   rq->rq_flags |= RQF_DONTPREP;
+   else if (r != BLK_STS_OK)
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
+
return r;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..7cb84ee4c9f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -994,7 +994,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct 
request *rq_src,
 void *data);
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-struct request *rq);
+    struct request *rq, bool force_insert);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);

-- 
Jens Axboe



Re: block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 7:06 PM, Jens Axboe wrote:
> On 12/6/18 6:58 PM, Mike Snitzer wrote:
>>> There is another way to fix this - still do the direct dispatch, but have
>>> dm track if it failed and do bypass insert in that case. I didn't want to
>>> do that since it's more involved, but it's doable.
>>>
>>> Let me cook that up and test it... Don't like it, though.
>>
>> Not following how DM can track if issuing the request worked if it is
>> always told it worked with BLK_STS_OK.  We care about feedback when the
>> request is actually issued because of the elaborate way blk-mq elevators
>> work.  DM is forced to worry about all these details, as covered some in
>> the header for commit 396eaf21ee17c476e8f66249fb1f4a39003d0ab4, it is
>> trying to have its cake and eat it too.  It just wants IO scheduling to
>> work for request-based DM devices.  That's it.
> 
> It needs the feedback, I don't disagree on that part at all. If we
> always return OK, then that loop is broken. How about something like the
> below? Totally untested right now...
> 
> We track if a request ever saw BLK_STS_RESOURCE from direct dispatch,
> and if it did, we store that information with RQF_DONTPREP. When we then
> next time go and insert a request, if it has RQF_DONTPREP set, then we
> ask blk_insert_cloned_request() to bypass insert.
> 
> I'll go test this now.

Passes the test case for me.

-- 
Jens Axboe



Re: [GIT PULL] Follow up block fix

2018-12-06 Thread Jens Axboe
On 12/6/18 5:33 PM, Jens Axboe wrote:
> On 12/6/18 5:31 PM, Bart Van Assche wrote:
>> On Thu, 2018-12-06 at 17:26 -0700, Jens Axboe wrote:
>>> On 12/6/18 5:20 PM, Bart Van Assche wrote:
>>>> Which branch does that tag correspond to? Even after having run git fetch
>>>> --tags I can't find that tag in your repository.
>>>
>>> I pushed it before I sent the email, where are you looking?
>>>
>>> http://git.kernel.dk/cgit/linux-block/tag/?h=for-linus-20181206
>>
>> On my development system git has been configured to fetch from the following 
>> repository:
>> https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git/
>>
>> Should I fetch your branches from the git.kernel.dk repository instead?
> 
> Ah I see, I always push to git.kernel.dk, and that syncs to git.kernel.org
> only every hour or so. So either one will work, but you might be a bit
> behind if you use git.kernel.org.

Linus, I just now noticed you are not on the CC for the discussion about
the patch. Don't pull this one yet. It'll solve the issue, but it'll also
mess up the BUSY feedback loop that DM relies on for good merging of
sequential IO. Testing a new patch now, will push it to you when folks
are happy and when my testing has completed.

-- 
Jens Axboe



Re: block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 6:58 PM, Mike Snitzer wrote:
>> There is another way to fix this - still do the direct dispatch, but have
>> dm track if it failed and do bypass insert in that case. I didn't want to
>> do that since it's more involved, but it's doable.
>>
>> Let me cook that up and test it... Don't like it, though.
> 
> Not following how DM can track if issuing the request worked if it is
> always told it worked with BLK_STS_OK.  We care about feedback when the
> request is actually issued because of the elaborate way blk-mq elevators
> work.  DM is forced to worry about all these details, as covered some in
> the header for commit 396eaf21ee17c476e8f66249fb1f4a39003d0ab4, it is
> trying to have its cake and eat it too.  It just wants IO scheduling to
> work for request-based DM devices.  That's it.

It needs the feedback, I don't disagree on that part at all. If we
always return OK, then that loop is broken. How about something like the
below? Totally untested right now...

We track if a request ever saw BLK_STS_RESOURCE from direct dispatch,
and if it did, we store that information with RQF_DONTPREP. When we then
next time go and insert a request, if it has RQF_DONTPREP set, then we
ask blk_insert_cloned_request() to bypass insert.

I'll go test this now.

diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..cccda51e165f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2617,7 +2617,8 @@ static int blk_cloned_rq_check_limits(struct 
request_queue *q,
  * @q:  the queue to submit the request
  * @rq: the request being queued
  */
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq)
+blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request 
*rq,
+  bool force_insert)
 {
unsigned long flags;
int where = ELEVATOR_INSERT_BACK;
@@ -2637,7 +2638,11 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   return blk_mq_request_issue_directly(rq);
+   if (force_insert) {
+   blk_mq_request_bypass_insert(rq, true);
+   return BLK_STS_OK;
+   } else
+   return blk_mq_request_issue_directly(rq);
}
 
spin_lock_irqsave(q->queue_lock, flags);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..8d4c5020ccaa 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -299,16 +299,27 @@ static void end_clone_request(struct request *clone, 
blk_status_t error)
 
 static blk_status_t dm_dispatch_clone_request(struct request *clone, struct 
request *rq)
 {
+   bool was_busy = (rq->rq_flags & RQF_DONTPREP) != 0;
blk_status_t r;
 
if (blk_queue_io_stat(clone->q))
clone->rq_flags |= RQF_IO_STAT;
 
clone->start_time_ns = ktime_get_ns();
-   r = blk_insert_cloned_request(clone->q, clone);
-   if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != 
BLK_STS_DEV_RESOURCE)
+
+   r = blk_insert_cloned_request(clone->q, clone, was_busy);
+   switch (r) {
+   default:
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
+   /* fall through */
+   case BLK_STS_RESOURCE:
+   case BLK_STS_DEV_RESOURCE:
+   rq->rq_flags |= RQF_DONTPREP;
+   case BLK_STS_OK:
+   break;
+   }
+
return r;
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..7cb84ee4c9f4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -994,7 +994,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct 
request *rq_src,
 void *data);
 extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
-struct request *rq);
+struct request *rq, bool force_insert);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);

-- 
Jens Axboe



Re: [PATCH] block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 6:22 PM, jianchao.wang wrote:
> 
> 
> On 12/7/18 9:13 AM, Jens Axboe wrote:
>> On 12/6/18 6:04 PM, jianchao.wang wrote:
>>>
>>>
>>> On 12/7/18 6:20 AM, Jens Axboe wrote:
>>>> After the direct dispatch corruption fix, we permanently disallow direct
>>>> dispatch of non read/write requests. This works fine off the normal IO
>>>> path, as they will be retried like any other failed direct dispatch
>>>> request. But for the blk_insert_cloned_request() that only DM uses to
>>>> bypass the bottom level scheduler, we always first attempt direct
>>>> dispatch. For some types of requests, that's now a permanent failure,
>>>> and no amount of retrying will make that succeed.
>>>>
>>>> Don't use direct dispatch off the cloned insert path, always just use
>>>> bypass inserts. This still bypasses the bottom level scheduler, which is
>>>> what DM wants.
>>>>
>>>> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
>>>> Signed-off-by: Jens Axboe 
>>>>
>>>> ---
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index deb56932f8c4..4c44e6fa0d08 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
>>>> request_queue *q, struct request *
>>>> * bypass a potential scheduler on the bottom device for
>>>> * insert.
>>>> */
>>>> -  return blk_mq_request_issue_directly(rq);
>>>> +  blk_mq_request_bypass_insert(rq, true);
>>>> +  return BLK_STS_OK;
>>>>}
>>>>  
>>>>spin_lock_irqsave(q->queue_lock, flags);
>>>>
>>> Not sure about this because it will break the merging improvement for
>>> request-based DM from Ming.
>>> 396eaf21ee17c476e8f66249fb1f4a39003d0ab4
>>> (blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request 
>>> feedback)
>>>
>>> We could use some other way to fix this.
>>
>> That really shouldn't matter as this is the cloned insert, merging should
>> have been done on the original request.
>>
>>
> Just quote some comments from the patch.
> 
> "
>But dm-rq currently can't get the underlying queue's
> dispatch feedback at all.  Without knowing whether a request was issued
> or not (e.g. due to underlying queue being busy) the dm-rq elevator will
> not be able to provide effective IO merging (as a side-effect of dm-rq
> currently blindly destaging a request from its elevator only to requeue
> it after a delay, which kills any opportunity for merging).  This
> obviously causes very bad sequential IO performance.
> ...
> With this, request-based DM's blk-mq sequential IO performance is vastly
> improved (as much as 3X in mpath/virtio-scsi testing)
> "
> 
> Using blk_mq_request_bypass_insert to replace blk_mq_request_issue_directly
> could be a fast way to fix the current issue. Maybe we could get the
> merging improvement back after some time.

This really sucks, mostly because DM wants to have it both ways - not use
the bottom level IO scheduler, but still actually use it if it makes sense.

There is another way to fix this - still do the direct dispatch, but have
dm track if it failed and do bypass insert in that case. I didn't want to
do that since it's more involved, but it's doable.

Let me cook that up and test it... Don't like it, though.

-- 
Jens Axboe



Re: [PATCH] block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 6:04 PM, jianchao.wang wrote:
> 
> 
> On 12/7/18 6:20 AM, Jens Axboe wrote:
>> After the direct dispatch corruption fix, we permanently disallow direct
>> dispatch of non read/write requests. This works fine off the normal IO
>> path, as they will be retried like any other failed direct dispatch
>> request. But for the blk_insert_cloned_request() that only DM uses to
>> bypass the bottom level scheduler, we always first attempt direct
>> dispatch. For some types of requests, that's now a permanent failure,
>> and no amount of retrying will make that succeed.
>>
>> Don't use direct dispatch off the cloned insert path, always just use
>> bypass inserts. This still bypasses the bottom level scheduler, which is
>> what DM wants.
>>
>> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
>> Signed-off-by: Jens Axboe 
>>
>> ---
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index deb56932f8c4..4c44e6fa0d08 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
>> request_queue *q, struct request *
>>   * bypass a potential scheduler on the bottom device for
>>   * insert.
>>   */
>> -return blk_mq_request_issue_directly(rq);
>> +blk_mq_request_bypass_insert(rq, true);
>> +return BLK_STS_OK;
>>  }
>>  
>>  spin_lock_irqsave(q->queue_lock, flags);
>>
> Not sure about this because it will break the merging improvement for
> request-based DM from Ming.
> 396eaf21ee17c476e8f66249fb1f4a39003d0ab4
> (blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request 
> feedback)
> 
> We could use some other way to fix this.

That really shouldn't matter as this is the cloned insert, merging should
have been done on the original request.


-- 
Jens Axboe



Re: [GIT PULL] Follow up block fix

2018-12-06 Thread Jens Axboe
On 12/6/18 5:31 PM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 17:26 -0700, Jens Axboe wrote:
>> On 12/6/18 5:20 PM, Bart Van Assche wrote:
>>> Which branch does that tag correspond to? Even after having run git fetch
>>> --tags I can't find that tag in your repository.
>>
>> I pushed it before I sent the email, where are you looking?
>>
>> http://git.kernel.dk/cgit/linux-block/tag/?h=for-linus-20181206
> 
> On my development system git has been configured to fetch from the following 
> repository:
> https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git/
> 
> Should I fetch your branches from the git.kernel.dk repository instead?

Ah I see, I always push to git.kernel.dk, and that syncs to git.kernel.org
only every hour or so. So either one will work, but you might be a bit
behind if you use git.kernel.org.

-- 
Jens Axboe



Re: [GIT PULL] Follow up block fix

2018-12-06 Thread Jens Axboe
On 12/6/18 5:20 PM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 16:59 -0700, Jens Axboe wrote:
>> Just a single followup fix to the corruption fix from yesterday. We have
>> an exported interface that does direct dispatch, with DM being the sole
>> user of it. Change that to do bypass insert always instead of attempting
>> direct dispatch. This fixes a deadlock that can happen on DM.
>> Additionally, it gets rid of any exported user of direct dispatch, and
>> enables DM to drop any BUSY handling from using the interface.
>>
>> Please pull!
>>
>>
>>   git://git.kernel.dk/linux-block.git tags/for-linus-20181206
> 
> Hi Jens,
> 
> Which branch does that tag correspond to? Even after having run git fetch
> --tags I can't find that tag in your repository.

I pushed it before I sent the email, where are you looking?

http://git.kernel.dk/cgit/linux-block/tag/?h=for-linus-20181206

> Additionally, the patch on your for-linus branch is missing the "Cc:
> stable" and "Reported-by:" tags that you had promised to add. Did I
> look at the wrong branch?

Doh I did - Linus, I'm going to amend the commit with that info and push
it out again, jfyi. Now done.

-- 
Jens Axboe



[GIT PULL] Follow up block fix

2018-12-06 Thread Jens Axboe
Hi Linus,

Just a single followup fix to the corruption fix from yesterday. We have
an exported interface that does direct dispatch, with DM being the sole
user of it. Change that to do bypass insert always instead of attempting
direct dispatch. This fixes a deadlock that can happen on DM.
Additionally, it gets rid of any exported user of direct dispatch, and
enables DM to drop any BUSY handling from using the interface.

Please pull!


  git://git.kernel.dk/linux-block.git tags/for-linus-20181206



Jens Axboe (1):
  block: fix direct dispatch issue failure for clones

 block/blk-core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

-- 
Jens Axboe



Re: block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 3:28 PM, Mike Snitzer wrote:
> On Thu, Dec 06 2018 at  5:20pm -0500,
> Jens Axboe  wrote:
> 
>> After the direct dispatch corruption fix, we permanently disallow direct
>> dispatch of non read/write requests. This works fine off the normal IO
>> path, as they will be retried like any other failed direct dispatch
>> request. But for the blk_insert_cloned_request() that only DM uses to
>> bypass the bottom level scheduler, we always first attempt direct
>> dispatch. For some types of requests, that's now a permanent failure,
>> and no amount of retrying will make that succeed.
>>
>> Don't use direct dispatch off the cloned insert path, always just use
>> bypass inserts. This still bypasses the bottom level scheduler, which is
>> what DM wants.
>>
>> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
>> Signed-off-by: Jens Axboe 
>>
>> ---
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index deb56932f8c4..4c44e6fa0d08 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
>> request_queue *q, struct request *
>>   * bypass a potential scheduler on the bottom device for
>>   * insert.
>>   */
>> -return blk_mq_request_issue_directly(rq);
>> +blk_mq_request_bypass_insert(rq, true);
>> +return BLK_STS_OK;
>>  }
>>  
>>  spin_lock_irqsave(q->queue_lock, flags);
> 
> Not sure what this trailing spin_lock_irqsave(q->queue_lock, flags) is
> about... but this looks good.

It's because the patch is against current -git; that code is gone in the 4.21 branch.


> I'll cleanup dm-rq.c to do away with the
> extra STS_RESOURCE checks for its call to blk_insert_cloned_request()
> once this lands.

That is indeed a nice benefit; all source-based failure cases can be
removed from the caller after this.
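
Just to illustrate what that cleanup could look like (a sketch, not an
actual dm patch), dm_dispatch_clone_request() could then shrink to
roughly:

	r = blk_insert_cloned_request(clone->q, clone);
	if (r != BLK_STS_OK)
		/* must complete clone in terms of original request */
		dm_complete_request(rq, r);
	return r;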

-- 
Jens Axboe



Re: [PATCH] block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
On 12/6/18 3:20 PM, Jens Axboe wrote:
> After the direct dispatch corruption fix, we permanently disallow direct
> dispatch of non read/write requests. This works fine off the normal IO
> path, as they will be retried like any other failed direct dispatch
> request. But for the blk_insert_cloned_request() that only DM uses to
> bypass the bottom level scheduler, we always first attempt direct
> dispatch. For some types of requests, that's now a permanent failure,
> and no amount of retrying will make that succeed.
> 
> Don't use direct dispatch off the cloned insert path, always just use
> bypass inserts. This still bypasses the bottom level scheduler, which is
> what DM wants.
> 
> Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
> Signed-off-by: Jens Axboe 

Bart, I'll add your reported-by here of course, and also a stable CC
since the original patch went into stable.

-- 
Jens Axboe



[PATCH] block: fix direct dispatch issue failure for clones

2018-12-06 Thread Jens Axboe
After the direct dispatch corruption fix, we permanently disallow direct
dispatch of non read/write requests. This works fine off the normal IO
path, as they will be retried like any other failed direct dispatch
request. But for the blk_insert_cloned_request() that only DM uses to
bypass the bottom level scheduler, we always first attempt direct
dispatch. For some types of requests, that's now a permanent failure,
and no amount of retrying will make that succeed.

Don't use direct dispatch off the cloned insert path, always just use
bypass inserts. This still bypasses the bottom level scheduler, which is
what DM wants.

Fixes: ffe81d45322c ("blk-mq: fix corruption with direct issue")
Signed-off-by: Jens Axboe 

---

diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..4c44e6fa0d08 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   return blk_mq_request_issue_directly(rq);
+   blk_mq_request_bypass_insert(rq, true);
+   return BLK_STS_OK;
}
 
spin_lock_irqsave(q->queue_lock, flags);

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 2:04 PM, Jens Axboe wrote:
> On 12/6/18 1:56 PM, Bart Van Assche wrote:
>> On Thu, 2018-12-06 at 08:47 -0800, Bart Van Assche wrote:
>>> If I merge Jens' for-next branch with Linus' master branch, boot the
>>> resulting kernel in a VM and run blktests/tests/srp/002 then that test
>>> never finishes. The same test passes against Linus' master branch. I
>>> think this is a regression. The following appears in the system log if
>>> I run that test:
>>>
>>> Call Trace:
>>> INFO: task kworker/0:1:12 blocked for more than 120 seconds.
>>> Call Trace:
>>> INFO: task ext4lazyinit:2079 blocked for more than 120 seconds.
>>> Call Trace:
>>> INFO: task fio:2151 blocked for more than 120 seconds.
>>> Call Trace:
>>> INFO: task fio:2154 blocked for more than 120 seconds.
>>
>> Hi Jens,
>>
>> My test results so far are as follows:
>> * With kernel v4.20-rc5 test srp/002 passes.
>> * With your for-next branch test srp/002 reports the symptoms reported in my 
>> e-mail.
>> * With Linus' master branch from this morning test srp/002 fails in the same 
>> way as
>>   your for-next branch.
>> * Also with Linus' master branch, test srp/002 passes if I revert the 
>> following commit:
>>   ffe81d45322c ("blk-mq: fix corruption with direct issue"). So it seems 
>> like that
>>   commit fixed one regression but introduced another regression.
> 
> Yes, I'm on the same page, I've been able to reproduce. It seems to be related
> to dm and bypass insert, which is somewhat odd. If I just do:
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index deb56932f8c4..4c44e6fa0d08 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
> request_queue *q, struct request *
>* bypass a potential scheduler on the bottom device for
>* insert.
>*/
> - return blk_mq_request_issue_directly(rq);
> + blk_mq_request_bypass_insert(rq, true);
> + return BLK_STS_OK;
>   }
>  
>   spin_lock_irqsave(q->queue_lock, flags);
> 
> it works fine. Well, at least this regression is less serious, I'll bang
> out a fix for it and ensure we make -rc6. I'm guessing it's the bypassing
> of non-read/write requests, which the top of your dispatch log also shows
> to be a non-read/write. But there should be no new failure case here that wasn't
> possible before, only it's easier to hit now.

OK, so here's the thing. As part of the corruption fix, we disallowed
direct dispatch for anything that wasn't a read or write. This means
that your WRITE_ZEROES will always fail direct dispatch. When it does,
we return busy. But next time dm will try the exact same thing again,
blk_insert_cloned_request() -> direct dispatch -> fail. Before we'd succeed
eventually, now we will always fail for that type.

The insert clone path is unique in that regard.
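
For reference, the restriction the corruption fix introduced boils down to
a check along these lines (paraphrased, not necessarily the literal hunk
from ffe81d45322c), which is why WRITE_ZEROES can never take the direct
dispatch path anymore:

/* paraphrased: only normal reads/writes may be issued directly */
static bool blk_rq_can_direct_dispatch(struct request *rq)
{
	return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
}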

So we have two options - the patch I did above which always just does
bypass insert for DM, or we need to mark the request as having failed
and just not retry direct dispatch for it. I'm still not crazy about
exploring the dispatch insert off this path.

And we'd need to do this on the original request in dm, not the clone
we are passed or it won't be persistent. Hence I lean towards the
already posted patch.

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 1:56 PM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 08:47 -0800, Bart Van Assche wrote:
>> If I merge Jens' for-next branch with Linus' master branch, boot the
>> resulting kernel in a VM and run blktests/tests/srp/002 then that test
>> never finishes. The same test passes against Linus' master branch. I
>> think this is a regression. The following appears in the system log if
>> I run that test:
>>
>> Call Trace:
>> INFO: task kworker/0:1:12 blocked for more than 120 seconds.
>> Call Trace:
>> INFO: task ext4lazyinit:2079 blocked for more than 120 seconds.
>> Call Trace:
>> INFO: task fio:2151 blocked for more than 120 seconds.
>> Call Trace:
>> INFO: task fio:2154 blocked for more than 120 seconds.
> 
> Hi Jens,
> 
> My test results so far are as follows:
> * With kernel v4.20-rc5 test srp/002 passes.
> * With your for-next branch test srp/002 reports the symptoms reported in my 
> e-mail.
> * With Linus' master branch from this morning test srp/002 fails in the same 
> way as
>   your for-next branch.
> * Also with Linus' master branch, test srp/002 passes if I revert the 
> following commit:
>   ffe81d45322c ("blk-mq: fix corruption with direct issue"). So it seems like 
> that
>   commit fixed one regression but introduced another regression.

Yes, I'm on the same page, I've been able to reproduce. It seems to be related
to dm and bypass insert, which is somewhat odd. If I just do:

diff --git a/block/blk-core.c b/block/blk-core.c
index deb56932f8c4..4c44e6fa0d08 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2637,7 +2637,8 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
 * bypass a potential scheduler on the bottom device for
 * insert.
 */
-   return blk_mq_request_issue_directly(rq);
+   blk_mq_request_bypass_insert(rq, true);
+   return BLK_STS_OK;
}
 
spin_lock_irqsave(q->queue_lock, flags);

it works fine. Well, at least this regression is less serious, I'll bang
out a fix for it and ensure we make -rc6. I'm guessing it's the bypassing
of non-read/write requests, which the top of your dispatch log also shows
to be a non-read/write. But there should be no new failure case here that wasn't
possible before, only it's easier to hit now.

-- 
Jens Axboe



Re: [PATCH 08/26] aio: don't zero entire aio_kiocb aio_get_req()

2018-12-06 Thread Jens Axboe
On 12/6/18 12:38 PM, Jeff Moyer wrote:
> Jens Axboe  writes:
> 
>> On 12/6/18 12:27 PM, Jeff Moyer wrote:
>>> Jens Axboe  writes:
>>>
>>>> It's 192 bytes, fairly substantial. Most items don't need to be cleared,
>>>> especially not upfront. Clear the ones we do need to clear, and leave
>>>> the other ones for setup when the iocb is prepared and submitted.
>>>
>>> What performance gains do you see from this?
>>
>> Before this, I saw 1% of the CPU time in memset when doing high IOPS. With
>> it, that's gone. 1% is a lot when you have just one thread doing everything
>> from submission to completion.
> 
> I'm used to customers complaining about fractions of a percent, so I get
> it.  :-)  I just wanted to know we had some measurable impact, as I've
> seen bugs crop up from code like this in the past.

Oh for sure, I wouldn't do it if there wasn't a noticeable gain from this!


-- 
Jens Axboe



Re: [PATCH 09/26] aio: only use blk plugs for > 2 depth submissions

2018-12-06 Thread Jens Axboe
On 12/6/18 12:29 PM, Jeff Moyer wrote:
> Jens Axboe  writes:
> 
>> Plugging is meant to optimize submission of a string of IOs, if we don't
>> have more than 2 being submitted, don't bother setting up a plug.
> 
> Is there really that much overhead in blk_{start|finish}_plug?

It's not just the overhead of the functions themselves, it's the
pointless work we end up doing for no reason at all. Plugging is, by
definition, meant to help batch things up, amortizing the cost of each
individual IO if you have a batch of them. If you don't have a batch,
then there's no point. Normally callers of blk_start_plug() don't
necessarily know how much IO is coming, but in this case we clearly do.
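
A minimal sketch of the idea (illustrative only; the iteration and the
submit helper are assumed context, not the actual aio patch):

	struct blk_plug plug;
	bool plugged = nr > 2;		/* only plug for a real batch */
	int i;

	if (plugged)
		blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_one_iocb(iocbs[i]);	/* hypothetical per-iocb submit */
	if (plugged)
		blk_finish_plug(&plug);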

-- 
Jens Axboe



Re: [PATCH 08/26] aio: don't zero entire aio_kiocb aio_get_req()

2018-12-06 Thread Jens Axboe
On 12/6/18 12:27 PM, Jeff Moyer wrote:
> Jens Axboe  writes:
> 
>> It's 192 bytes, fairly substantial. Most items don't need to be cleared,
>> especially not upfront. Clear the ones we do need to clear, and leave
>> the other ones for setup when the iocb is prepared and submitted.
> 
> What performance gains do you see from this?

Before this, I saw 1% of the CPU time in memset when doing high IOPS. With
it, that's gone. 1% is a lot when you have just one thread doing everything
from submission to completion.
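
The pattern is simply to stop zeroing the whole object at allocation time
and only initialize the fields that must be valid early. A sketch, with
illustrative field names rather than the exact aio_kiocb layout:

	/* before: req = kmem_cache_zalloc(kiocb_cachep, GFP_KERNEL); */
	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
	if (unlikely(!req))
		return NULL;

	/* only clear/init what early completion/cancel paths may touch */
	req->ki_ctx = ctx;
	INIT_LIST_HEAD(&req->ki_list);
	req->ki_eventfd = NULL;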

-- 
Jens Axboe



Re: [PATCH 03/26] block: wire up block device iopoll method

2018-12-06 Thread Jens Axboe
On 12/6/18 12:15 PM, Christoph Hellwig wrote:
> On Thu, Dec 06, 2018 at 12:14:29PM -0700, Jens Axboe wrote:
>> On 12/6/18 12:11 PM, Jeff Moyer wrote:
>>> Jens Axboe  writes:
>>>
>>>> From: Christoph Hellwig 
>>>>
>>>> Just call blk_poll on the iocb cookie, we can derive the block device
>>>> from the inode trivially.
>>>
>>> Does this work for multi-device file systems?
>>
>> It should, that's the whole purpose of having fops->iopoll. You should
>> be able to derive it from inode + cookie.
> 
> It still assumes the whole I/O will go to a single device.  For
> XFS this is always true, but for btrfs this might mean I/O could
> need splitting if ->iopoll were to be supported there.

Right, we can only support one target for one particular piece of IO,
but multi-device works fine in general.

-- 
Jens Axboe



Re: [PATCH 03/26] block: wire up block device iopoll method

2018-12-06 Thread Jens Axboe
On 12/6/18 12:11 PM, Jeff Moyer wrote:
> Jens Axboe  writes:
> 
>> From: Christoph Hellwig 
>>
>> Just call blk_poll on the iocb cookie, we can derive the block device
>> from the inode trivially.
> 
> Does this work for multi-device file systems?

It should, that's the whole purpose of having fops->iopoll. You should
be able to derive it from inode + cookie.
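
A rough sketch of what such a block device ->iopoll hook could look like
(the exact hook signature and the ki_cookie field are assumptions for
illustration, not necessarily what the series uses):

	static int blkdev_iopoll_sketch(struct kiocb *kiocb)
	{
		struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
		struct request_queue *q = bdev_get_queue(bdev);

		/* submission is assumed to have stored the blk_qc_t cookie
		 * returned by submit_bio() in the kiocb */
		return blk_poll(q, READ_ONCE(kiocb->ki_cookie));
	}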

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 11:10 AM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 10:02 -0800, Bart Van Assche wrote:
>> On Thu, 2018-12-06 at 10:48 -0700, Jens Axboe wrote:
>>> which would result in a non-zero exit, which should be expected for this
>>> test?
>>
>> Test srp/002 simulates network failures while running fio on top of dm-mpath.
>> Since queue_if_no_path is enabled in multipath.conf dm-mpath will keep 
>> retrying
>> the block layer requests it receives if these requests fail due to path 
>> removal.
>> If fio reports "io_u error" for this test then that means that the test 
>> failed.
>> I haven't seen fio reporting any I/O errors for this test with any upstream
>> kernel that I tested in the past year or so.
> 
> Hi Jens,
> 
> Please also verify that the version of multipath-tools that you are running 
> includes
> the following patch:

I installed from apt... Says this:

# apt show multipath-tools
Package: multipath-tools
Version: 0.6.4-5+deb9u1

I can run a self-compiled one, if this one doesn't work.

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 11:02 AM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 10:48 -0700, Jens Axboe wrote:
>> which would result in a non-zero exit, which should be expected for this
>> test?
> 
> Hi Jens,
> 
> Test srp/002 simulates network failures while running fio on top of dm-mpath.
> Since queue_if_no_path is enabled in multipath.conf dm-mpath will keep 
> retrying
> the block layer requests it receives if these requests fail due to path 
> removal.
> If fio reports "io_u error" for this test then that means that the test 
> failed.
> I haven't seen fio reporting any I/O errors for this test with any upstream
> kernel that I tested in the past year or so.

Just ran it on current -git, and got the exact same results. Maybe there are
some clues in the log files I sent in the previous email?

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 10:28 AM, Jens Axboe wrote:
> On 12/6/18 10:19 AM, Bart Van Assche wrote:
>> On Thu, 2018-12-06 at 10:00 -0700, Jens Axboe wrote:
>>> On 12/6/18 9:47 AM, Bart Van Assche wrote:
>>>> If I merge Jens' for-next branch with Linus' master branch, boot the
>>>> resulting kernel in a VM and run blktests/tests/srp/002 then that test
>>>> never finishes. The same test passes against Linus' master branch. I
>>>> think this is a regression. The following appears in the system log if
>>>> I run that test:
>>>
>>> You are running that test on a dm device? Can you shed some light on
>>> how that dm device is setup?
>>
>> Hi Jens,
>>
>> The dm device referred to in my e-mail is a dm-mpath device created by test
>> srp/002. All parameters of that device are under control of that test script.
>> From the srp/002 test script:
>>
>> DESCRIPTION="File I/O on top of multipath concurrently with logout and login 
>> (mq)"
>>
>> From srp/multipath.conf:
>>
>> defaults {
>>  find_multipaths no
>>  user_friendly_names yes
>>  queue_without_daemonno
>> }
>>
>> devices {
>>  device {
>>  vendor  "LIO-ORG|SCST_BIO|FUSIONIO"
>>  product ".*"
>>  features"1 queue_if_no_path"
>>  path_checkertur
>>  }
>> }
> 
> 5 kernel compiles later, and I think I'm almost ready to run it...

axboe@dell:/home/axboe/git/blktests $ sudo ./check srp/002
srp/002 (File I/O on top of multipath concurrently with logout and login (mq)) 
[failed]
runtime  33.689s  ...  32.212s
--- tests/srp/002.out   2018-11-09 08:42:22.842029804 -0700
+++ /home/axboe/git/blktests/results/nodev/srp/002.out.bad  2018-12-06 
10:45:35.995183521 -0700
@@ -1,2 +1 @@
 Configured SRP target driver
-Passed

It runs for about 32-33s every time and then I get this. The fio output shows 
this:

data-integrity-test-mq: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 
4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=64
...
fio-3.12-27-g68af
Starting 16 threads
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
data-integrity-test-mq: Laying out IO file (1 file / 1MiB)
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.14.0:
 Input/output error: read offset=724992, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.5.0:
 Input/output error: read offset=983040, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.7.0:
 Input/output error: write offset=700416, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.8.0:
 Input/output error: write offset=237568, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.9.0:
 Input/output error: write offset=1277952, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.9.0:
 Input/output error: write offset=970752, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.9.0:
 Input/output error: write offset=974848, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.9.0:
 Input/output error: write offset=978944, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.9.0:
 Input/output error: write offset=983040, buflen=4096
fio: io_u error on file 
/home/axboe/git/blktests/results/tmpdir.srp.002.7WR/mnt0/data-integrity-test-mq.15.0:
 Input/output error: write offset=172032, buflen=4096
fio: io_u error on file 
/home/axboe/

Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 10:19 AM, Bart Van Assche wrote:
> On Thu, 2018-12-06 at 10:00 -0700, Jens Axboe wrote:
>> On 12/6/18 9:47 AM, Bart Van Assche wrote:
>>> If I merge Jens' for-next branch with Linus' master branch, boot the
>>> resulting kernel in a VM and run blktests/tests/srp/002 then that test
>>> never finishes. The same test passes against Linus' master branch. I
>>> think this is a regression. The following appears in the system log if
>>> I run that test:
>>
>> You are running that test on a dm device? Can you shed some light on
>> how that dm device is setup?
> 
> Hi Jens,
> 
> The dm device referred to in my e-mail is a dm-mpath device created by test
> srp/002. All parameters of that device are under control of that test script.
> From the srp/002 test script:
> 
> DESCRIPTION="File I/O on top of multipath concurrently with logout and login 
> (mq)"
> 
> From srp/multipath.conf:
> 
> defaults {
>   find_multipaths no
>   user_friendly_names yes
>   queue_without_daemonno
> }
> 
> devices {
>   device {
>   vendor  "LIO-ORG|SCST_BIO|FUSIONIO"
>   product ".*"
>   features"1 queue_if_no_path"
>   path_checkertur
>   }
> }

5 kernel compiles later, and I think I'm almost ready to run it...

-- 
Jens Axboe



Re: for-next branch and blktests/srp

2018-12-06 Thread Jens Axboe
On 12/6/18 9:47 AM, Bart Van Assche wrote:
> Hello,
> 
> If I merge Jens' for-next branch with Linus' master branch, boot the
> resulting kernel in a VM and run blktests/tests/srp/002 then that test
> never finishes. The same test passes against Linus' master branch. I
> think this is a regression. The following appears in the system log if
> I run that test:

You are running that test on a dm device? Can you shed some light on
how that dm device is setup?

-- 
Jens Axboe



Re: for-next branch and throttling

2018-12-06 Thread Jens Axboe
On 12/6/18 9:39 AM, Bart Van Assche wrote:
> Hello,
> 
> If I merge Jens' for-next branch with Linus' master branch and boot the
> resulting kernel in a VM then the call trace shown below appears. This call
> trace does not appear when building and booting the code from Linus' master
> branch. Is this perhaps a regression?
> 
> WARNING: CPU: 1 PID: 257 at block/blk-throttle.c:2128 
> blk_throtl_bio+0xc00/0x1120
> Modules linked in: floppy(+) virtio_blk(+) virtio_net net_failover failover 
> i2c_piix4 pata_acpi
> CPU: 1 PID: 257 Comm: systemd-udevd Not tainted 4.20.0-rc5-dbg+ #1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 
> 04/01/2014
> RIP: 0010:blk_throtl_bio+0xc00/0x1120

It's some over-zealous rcu removal, simple fix is coming. CC Dennis.

-- 
Jens Axboe



[GIT PULL] Block fixes for 4.20-rc6

2018-12-05 Thread Jens Axboe
Hi Linus,

A bit earlier in the week as usual, but there's a fix here that should
go in sooner rather than later. Under a combination of circumstances, the
direct issue path in blk-mq could corrupt data. This wasn't easy to hit,
but the ones that are affected by it, seem to hit it pretty easily. Full
explanation in the patch. None of the regular filesystem and storage
testing has triggered it, even though it's been around since 4.19-rc1.

Outside of that, a trim whitelist tweak for certain Samsung devices in
libata.

Please pull!


  git://git.kernel.dk/linux-block.git tags/for-linus-20181205



Jens Axboe (1):
  blk-mq: fix corruption with direct issue

Juha-Matti Tilli (1):
  libata: whitelist all SAMSUNG MZ7KM* solid-state disks

 block/blk-mq.c| 26 +-
 drivers/ata/libata-core.c |  1 +
 2 files changed, 26 insertions(+), 1 deletion(-)

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-05 Thread Jens Axboe
On 12/5/18 12:09 PM, Guenter Roeck wrote:
> On Wed, Dec 05, 2018 at 10:59:21AM -0700, Jens Axboe wrote:
> [ ... ]
>>
>>> Also, it seems to me that even with this problem fixed, blk-mq may not
>>> be ready for primetime after all. With that in mind, maybe commit
>>> d5038a13eca72 ("scsi: core: switch to scsi-mq by default") was a
>> bit premature. Should that be reverted?
>>
>> I have to strongly disagree with that, the timing is just unfortunate.
>> There are literally millions of machines running blk-mq/scsi-mq, and
>> this is the only hiccup we've had. So I want to put this one to rest
>> once and for all, there's absolutely no reason not to continue with
>> what we've planned.
>>
> 
> Guess we have to agree to disagree. In my opinion, for infrastructure
> as critical as this, a single hiccup is one hiccup too many. Not that
> I would describe this as a hiccup in the first place; I would describe
> it as major disaster.

Don't get me wrong, I don't mean to use hiccup in a diminishing fashion,
this was by all means a disaster for the ones hit by it. But if you look
at the scope of how many folks are using blk-mq/scsi-mq and have been
for years, we're really talking about a tiny tiny percentage here. This
could just as easily have happened with the old IO stack. The bug was a
freak accident, and even with full knowledge of why it happened, I'm
still having an extraordinarily hard time triggering it at will on my
test boxes. As with any disaster, it's usually a combination of multiple
things that go wrong, and this one is no different. The folks that hit
this generally hit it pretty easily, and (by far) the majority would
never hit it.

Bugs happen, whether you like it or not. They happen in file systems,
memory management, and they happen in storage. Things are continually
developed, and that sometimes introduces bugs. We do our best to ensure
that doesn't happen, but sometimes freak accidents like this happen. I
think my track record of decades of work speaks for itself there, it's
not like this is a frequent occurence. And if this particular issue
wasn't well understood and instead just reverted the offending commits,
then I would agree with you. But that's not the case. I'm very confident
in the stability, among other things, of blk-mq and the drivers that
utilize it. Most of the storage drivers are using it today, and have
been for a long time.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-05 Thread Jens Axboe
On 12/5/18 10:55 AM, Guenter Roeck wrote:
> On Tue, Dec 04, 2018 at 07:25:05PM -0700, Jens Axboe wrote:
>> On 12/4/18 6:38 PM, Guenter Roeck wrote:
>>> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>>>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>>>> we queue the request up normally. However, the SCSI layer may have
>>>> already setup SG tables etc for this particular command. If we later
>>>> merge with this request, then the old tables are no longer valid. Once
>>>> we issue the IO, we only read/write the original part of the request,
>>>> not the new state of it.
>>>>
>>>> This causes data corruption, and is most often noticed with the file
>>>> system complaining about the just read data being invalid:
>>>>
>>>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>>>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>>>
>>>> because most of it is garbage...
>>>>
>>>> This doesn't happen from the normal issue path, as we will simply defer
>>>> the request to the hardware queue dispatch list if we fail. Once it's on
>>>> the dispatch list, we never merge with it.
>>>>
>>>> Fix this from the direct issue path by flagging the request as
>>>> REQ_NOMERGE so we don't change the size of it before issue.
>>>>
>>>> See also:
>>>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>>>
>>>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case 
>>>> of 'none'")
>>>> Signed-off-by: Jens Axboe 
>>>
>>> Tested-by: Guenter Roeck 
>>>
>>> ... on two systems affected by the problem.
>>
>> Thanks for testing! And for being persistent in reproducing and
>> providing clues for getting this nailed.
>>
> 
> My pleasure.
> 
> I see that there is some discussion about this patch.
> 
> Unfortunately, everyone running a 4.19 or later kernel is at serious
> risk of data corruption. Given that, if this patch doesn't make it
> upstream for one reason or another, would it be possible to at least
> revert the two patches introducing the problem until this is sorted
> out for good? If this is not acceptable either, maybe mark blk-mq
> as broken? After all, it _is_ broken. This is even more true if it
> turns out that a problem may exist since 4.1, as suggested in the
> discussion.

It is queued up, it'll go upstream later today.

> Also, it seems to me that even with this problem fixed, blk-mq may not
> be ready for primetime after all. With that in mind, maybe commit
> d5038a13eca72 ("scsi: core: switch to scsi-mq by default") was a
> bit premature. Should that be reverted?

I have to strongly disagree with that, the timing is just unfortunate.
There are literally millions of machines running blk-mq/scsi-mq, and
this is the only hiccup we've had. So I want to put this one to rest
once and for all, there's absolutely no reason not to continue with
what we've planned.

-- 
Jens Axboe



Re: [PATCH 0/3] Unify the throttling code for wbt and io-latency

2018-12-05 Thread Jens Axboe
On 12/4/18 10:59 AM, Josef Bacik wrote:
> Originally when I wrote io-latency and the rq_qos code to provide a common 
> base
> between wbt and io-latency I left out the throttling part.  These were 
> basically
> the same, but slightly different in both cases.  The difference was enough and
> the code wasn't too complicated that I just copied it into io-latency and
> modified it for what I needed and carried on.
> 
> Since then Jens has fixed a few issues with wakeups in the naive approach.
> Before you could easily cycle waiters back to the end of the line if they were
> woken up without the ability to actually do their IO yet.  But because this 
> was
> only in wbt we didn't get it in io-latency.
> 
> Resolve this by creating a unified interface for doing the throttling, and 
> then
> just handle the differences between the two users with user specific 
> callbacks.
> This allows us to have one place where we have to mess with wakeups, and gives
> each user the ability to be their own special snowflake.
> 
> Jens, I based this on for-next from 12/03, let me know if you want a different
> base.  I tested this with my blktests test.  Thanks,

Applies fine to for-4.21/block, which is what I care about. Looks good to me,
applied, thanks.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-05 Thread Jens Axboe
On 12/5/18 7:41 AM, Christoph Hellwig wrote:
> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>> we queue the request up normally. However, the SCSI layer may have
>> already setup SG tables etc for this particular command. If we later
>> merge with this request, then the old tables are no longer valid. Once
>> we issue the IO, we only read/write the original part of the request,
>> not the new state of it.
>>
>> This causes data corruption, and is most often noticed with the file
>> system complaining about the just read data being invalid:
> 
> Looks good as the first quick fix:
> 
> Reviewed-by: Christoph Hellwig 
> 
> But as we discussed I think we need to be even more careful. including
> the only read and write check below in the thread or some higher level
> approach.

I agree, I'm reviewing the patches from Jianchao now and will do
something cleaner on top of that.

-- 
Jens Axboe



Re: [PATCH 15/26] aio: support for IO polling

2018-12-05 Thread Jens Axboe
On 12/5/18 2:56 AM, Benny Halevy wrote:
>> +static int aio_iopoll_getevents(struct kioctx *ctx,
>> +struct io_event __user *event,
>> +unsigned int *nr_events, long min, long max)
>> +{
>> +struct aio_kiocb *iocb;
>> +int to_poll, polled, ret;
>> +
>> +/*
>> + * Check if we already have done events that satisfy what we need
>> + */
>> +if (!list_empty(&ctx->poll_completing)) {
>> +ret = aio_iopoll_reap(ctx, event, nr_events, max);
>> +if (ret < 0)
>> +return ret;
>> +/* if min is zero, still go through a poll check */
> 
> A function-level comment about that would be more visible.

Yes, that might be a better idea.

>> +if (min && *nr_events >= min)
>>
>> +return 0;
> 
> if ((min && *nr_events >= min) || *nr_events >= max) ?
> 
> Otherwise, if poll_completing or poll_submitted are not empty,
> besides getting to the "Shouldn't happen" case in aio_iopoll_reap,
> it looks like we're gonna waste some cycles and never call f_op->iopoll

Agree, that would be a better place to catch it. I'll make those two
changes, thanks!

>> +
>> +/*
>> + * Find up to 'max' worth of events to poll for, including the
>> + * events we already successfully polled
>> + */
>> +polled = to_poll = 0;
>> +list_for_each_entry(iocb, &ctx->poll_completing, ki_list) {
>> +/*
>> + * Poll for needed events with spin == true, anything after
>> + * that we just check if we have more, up to max.
>> + */
>> +bool spin = polled + *nr_events >= min;
> 
> I'm confused. Shouldn't spin == true if polled + *nr_events < min?

Heh, that does look off, and it is. I think the correct condition is:

bool spin = !polled || *nr_events < min;

and I'm not sure why that got broken. I just tested it, slight
performance win as expected correcting this. It ties in with the above
nr_events check - it's perfectly valid to pass min_nr == 0 and just do a
poll check. Conversely, if we already spun, just do non-spin checks on
followup poll checks.
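
To make that concrete, with the corrected condition the quoted loop would
read roughly as below (reconstructed from the hunk above, so treat it as a
sketch rather than the final patch):

	polled = to_poll = 0;
	list_for_each_entry(iocb, &ctx->poll_completing, ki_list) {
		/*
		 * Spin for completions while nothing has been polled yet or
		 * we still need events to satisfy min; after that, only do
		 * non-spin checks for more, up to max.
		 */
		bool spin = !polled || *nr_events < min;
		...
	}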

>> +static int __aio_iopoll_check(struct kioctx *ctx, struct io_event __user 
>> *event,
>> +  unsigned int *nr_events, long min_nr, long max_nr)
>> +{
>> +int ret = 0;
>> +
>> +while (!*nr_events || !need_resched()) {
>> +int tmin = 0;
>> +
>> +if (*nr_events < min_nr)
>> +tmin = min_nr - *nr_events;
> 
> Otherwise, shouldn't the function just return 0?
> 
> Or perhaps you explicitly want to go through the tmin==0 case
> in aio_iopoll_getevents if *nr_events == min_nr (or min_nr==0)?

No, we do need to go through the poll if min_nr == 0, but only if min_nr == 0
and !nr_events. But we only get here with nr_events != 0, so it should be
fine as-is.

Thanks for your review, very useful! I'll make the above changes right
now.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 8:03 PM, Ming Lei wrote:
> On Wed, Dec 05, 2018 at 10:58:02AM +0800, Ming Lei wrote:
>> On Tue, Dec 04, 2018 at 07:30:24PM -0700, Jens Axboe wrote:
>>> On 12/4/18 7:27 PM, Ming Lei wrote:
>>>> On Tue, Dec 04, 2018 at 07:16:11PM -0700, Jens Axboe wrote:
>>>>> On 12/4/18 6:37 PM, Ming Lei wrote:
>>>>>> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>>>>>>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>>>>>>> we queue the request up normally. However, the SCSI layer may have
>>>>>>> already setup SG tables etc for this particular command. If we later
>>>>>>> merge with this request, then the old tables are no longer valid. Once
>>>>>>> we issue the IO, we only read/write the original part of the request,
>>>>>>> not the new state of it.
>>>>>>>
>>>>>>> This causes data corruption, and is most often noticed with the file
>>>>>>> system complaining about the just read data being invalid:
>>>>>>>
>>>>>>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode 
>>>>>>> #7142: comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>>>>>>
>>>>>>> because most of it is garbage...
>>>>>>>
>>>>>>> This doesn't happen from the normal issue path, as we will simply defer
>>>>>>> the request to the hardware queue dispatch list if we fail. Once it's on
>>>>>>> the dispatch list, we never merge with it.
>>>>>>>
>>>>>>> Fix this from the direct issue path by flagging the request as
>>>>>>> REQ_NOMERGE so we don't change the size of it before issue.
>>>>>>>
>>>>>>> See also:
>>>>>>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>>>>>>
>>>>>>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in 
>>>>>>> case of 'none'")
>>>>>>> Signed-off-by: Jens Axboe 
>>>>>>>
>>>>>>> ---
>>>>>>>
>>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>>> index 3f91c6e5b17a..d8f518c6ea38 100644
>>>>>>> --- a/block/blk-mq.c
>>>>>>> +++ b/block/blk-mq.c
>>>>>>> @@ -1715,6 +1715,15 @@ static blk_status_t 
>>>>>>> __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
>>>>>>> break;
>>>>>>> case BLK_STS_RESOURCE:
>>>>>>> case BLK_STS_DEV_RESOURCE:
>>>>>>> +   /*
>>>>>>> +* If direct dispatch fails, we cannot allow any 
>>>>>>> merging on
>>>>>>> +* this IO. Drivers (like SCSI) may have set up 
>>>>>>> permanent state
>>>>>>> +* for this request, like SG tables and mappings, and 
>>>>>>> if we
>>>>>>> +* merge to it later on then we'll still only do IO to 
>>>>>>> the
>>>>>>> +* original part.
>>>>>>> +*/
>>>>>>> +   rq->cmd_flags |= REQ_NOMERGE;
>>>>>>> +
>>>>>>> blk_mq_update_dispatch_busy(hctx, true);
>>>>>>> __blk_mq_requeue_request(rq);
>>>>>>> break;
>>>>>>>
>>>>>>
>>>>>> Not sure it is enough to just mark it as NOMERGE, for example, driver
>>>>>> may have setup the .special_vec for discard, and NOMERGE may not prevent
>>>>>> request from entering elevator queue completely. Cause 'rq.rb_node' and
>>>>>> 'rq.special_vec' share same space.
>>>>>
>>>>> We should rather limit the scope of the direct dispatch instead. It
>>>>> doesn't make sense to do for anything but read/write anyway.
>>>>
>>>> discard is kind of write, and it isn't treated very specially in make
>>>> request path, except for multi-range discard.
>>>
>>> The point of direct dispatch is to reduce latencies for requests,
>>> discards are so damn slow

Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 7:58 PM, Ming Lei wrote:
> On Tue, Dec 04, 2018 at 07:30:24PM -0700, Jens Axboe wrote:
>> On 12/4/18 7:27 PM, Ming Lei wrote:
>>> On Tue, Dec 04, 2018 at 07:16:11PM -0700, Jens Axboe wrote:
>>>> On 12/4/18 6:37 PM, Ming Lei wrote:
>>>>> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>>>>>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>>>>>> we queue the request up normally. However, the SCSI layer may have
>>>>>> already setup SG tables etc for this particular command. If we later
>>>>>> merge with this request, then the old tables are no longer valid. Once
>>>>>> we issue the IO, we only read/write the original part of the request,
>>>>>> not the new state of it.
>>>>>>
>>>>>> This causes data corruption, and is most often noticed with the file
>>>>>> system complaining about the just read data being invalid:
>>>>>>
>>>>>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>>>>>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>>>>>
>>>>>> because most of it is garbage...
>>>>>>
>>>>>> This doesn't happen from the normal issue path, as we will simply defer
>>>>>> the request to the hardware queue dispatch list if we fail. Once it's on
>>>>>> the dispatch list, we never merge with it.
>>>>>>
>>>>>> Fix this from the direct issue path by flagging the request as
>>>>>> REQ_NOMERGE so we don't change the size of it before issue.
>>>>>>
>>>>>> See also:
>>>>>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>>>>>
>>>>>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in 
>>>>>> case of 'none'")
>>>>>> Signed-off-by: Jens Axboe 
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>> index 3f91c6e5b17a..d8f518c6ea38 100644
>>>>>> --- a/block/blk-mq.c
>>>>>> +++ b/block/blk-mq.c
>>>>>> @@ -1715,6 +1715,15 @@ static blk_status_t 
>>>>>> __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
>>>>>>  break;
>>>>>>  case BLK_STS_RESOURCE:
>>>>>>  case BLK_STS_DEV_RESOURCE:
>>>>>> +/*
>>>>>> + * If direct dispatch fails, we cannot allow any 
>>>>>> merging on
>>>>>> + * this IO. Drivers (like SCSI) may have set up 
>>>>>> permanent state
>>>>>> + * for this request, like SG tables and mappings, and 
>>>>>> if we
>>>>>> + * merge to it later on then we'll still only do IO to 
>>>>>> the
>>>>>> + * original part.
>>>>>> + */
>>>>>> +rq->cmd_flags |= REQ_NOMERGE;
>>>>>> +
>>>>>>  blk_mq_update_dispatch_busy(hctx, true);
>>>>>>  __blk_mq_requeue_request(rq);
>>>>>>  break;
>>>>>>
>>>>>
>>>>> Not sure it is enough to just mark it as NOMERGE, for example, driver
>>>>> may have setup the .special_vec for discard, and NOMERGE may not prevent
>>>>> request from entering elevator queue completely. Cause 'rq.rb_node' and
>>>>> 'rq.special_vec' share same space.
>>>>
>>>> We should rather limit the scope of the direct dispatch instead. It
>>>> doesn't make sense to do for anything but read/write anyway.
>>>
>>> discard is kind of write, and it isn't treated very specially in make
>>> request path, except for multi-range discard.
>>
>> The point of direct dispatch is to reduce latencies for requests,
>> discards are so damn slow on ALL devices anyway that it doesn't make any
>> sense to try direct dispatch to begin with, regardless of whether it is
>> possible or not.
> 
> SCSI MQ device may benefit from direct dispatch from reduced lock contention.
> 
>>
>>>>> So how about inserting this request via blk_mq_request_bypass_insert()

Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 7:27 PM, Ming Lei wrote:
> On Tue, Dec 04, 2018 at 07:16:11PM -0700, Jens Axboe wrote:
>> On 12/4/18 6:37 PM, Ming Lei wrote:
>>> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>>>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>>>> we queue the request up normally. However, the SCSI layer may have
>>>> already setup SG tables etc for this particular command. If we later
>>>> merge with this request, then the old tables are no longer valid. Once
>>>> we issue the IO, we only read/write the original part of the request,
>>>> not the new state of it.
>>>>
>>>> This causes data corruption, and is most often noticed with the file
>>>> system complaining about the just read data being invalid:
>>>>
>>>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>>>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>>>
>>>> because most of it is garbage...
>>>>
>>>> This doesn't happen from the normal issue path, as we will simply defer
>>>> the request to the hardware queue dispatch list if we fail. Once it's on
>>>> the dispatch list, we never merge with it.
>>>>
>>>> Fix this from the direct issue path by flagging the request as
>>>> REQ_NOMERGE so we don't change the size of it before issue.
>>>>
>>>> See also:
>>>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>>>
>>>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case 
>>>> of 'none'")
>>>> Signed-off-by: Jens Axboe 
>>>>
>>>> ---
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 3f91c6e5b17a..d8f518c6ea38 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -1715,6 +1715,15 @@ static blk_status_t __blk_mq_issue_directly(struct 
>>>> blk_mq_hw_ctx *hctx,
>>>>break;
>>>>case BLK_STS_RESOURCE:
>>>>case BLK_STS_DEV_RESOURCE:
>>>> +  /*
>>>> +   * If direct dispatch fails, we cannot allow any merging on
>>>> +   * this IO. Drivers (like SCSI) may have set up permanent state
>>>> +   * for this request, like SG tables and mappings, and if we
>>>> +   * merge to it later on then we'll still only do IO to the
>>>> +   * original part.
>>>> +   */
>>>> +  rq->cmd_flags |= REQ_NOMERGE;
>>>> +
>>>>blk_mq_update_dispatch_busy(hctx, true);
>>>>__blk_mq_requeue_request(rq);
>>>>break;
>>>>
>>>
>>> Not sure it is enough to just mark it as NOMERGE, for example, driver
>>> may have setup the .special_vec for discard, and NOMERGE may not prevent
>>> request from entering elevator queue completely. Cause 'rq.rb_node' and
>>> 'rq.special_vec' share same space.
>>
>> We should rather limit the scope of the direct dispatch instead. It
>> doesn't make sense to do for anything but read/write anyway.
> 
> discard is kind of write, and it isn't treated very specially in make
> request path, except for multi-range discard.

The point of direct dispatch is to reduce latencies for requests,
discards are so damn slow on ALL devices anyway that it doesn't make any
sense to try direct dispatch to begin with, regardless of whether it is
possible or not.

>>> So how about inserting this request via blk_mq_request_bypass_insert()
>>> in case that direct issue returns BUSY? Then it is invariant that
>>> any request queued via .queue_rq() won't enter scheduler queue.
>>
>> I did consider this, but I didn't want to experiment with exercising
>> a new path for an important bug fix. You do realize that your original
>> patch has been corrupting data for months? I think a little caution
>> is in order here.
> 
> But marking NOMERGE still may have a hole on re-insert discard request as
> mentioned above.

What I said was further limit the scope of direct dispatch, which means
not allowing anything that isn't a read/write.

> Given we never allow to re-insert queued request to scheduler queue
> except for 6ce3dd6eec1, I think it is the correct thing to do, and the
> fix is simple too.

As I said, it's not the time to experiment. This issue has been there
since 4.19-rc1. The alternative is yanking both those patches, and then
looking at it later when the direct issue path has been cleaned up
first.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 6:38 PM, Guenter Roeck wrote:
> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>> we queue the request up normally. However, the SCSI layer may have
>> already setup SG tables etc for this particular command. If we later
>> merge with this request, then the old tables are no longer valid. Once
>> we issue the IO, we only read/write the original part of the request,
>> not the new state of it.
>>
>> This causes data corruption, and is most often noticed with the file
>> system complaining about the just read data being invalid:
>>
>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>
>> because most of it is garbage...
>>
>> This doesn't happen from the normal issue path, as we will simply defer
>> the request to the hardware queue dispatch list if we fail. Once it's on
>> the dispatch list, we never merge with it.
>>
>> Fix this from the direct issue path by flagging the request as
>> REQ_NOMERGE so we don't change the size of it before issue.
>>
>> See also:
>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>
>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case 
>> of 'none'")
>> Signed-off-by: Jens Axboe 
> 
> Tested-by: Guenter Roeck 
> 
> ... on two systems affected by the problem.

Thanks for testing! And for being persistent in reproducing and
providing clues for getting this nailed.

-- 
Jens Axboe



Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 7:16 PM, Jens Axboe wrote:
> On 12/4/18 6:37 PM, Ming Lei wrote:
>> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>>> we queue the request up normally. However, the SCSI layer may have
>>> already setup SG tables etc for this particular command. If we later
>>> merge with this request, then the old tables are no longer valid. Once
>>> we issue the IO, we only read/write the original part of the request,
>>> not the new state of it.
>>>
>>> This causes data corruption, and is most often noticed with the file
>>> system complaining about the just read data being invalid:
>>>
>>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>>
>>> because most of it is garbage...
>>>
>>> This doesn't happen from the normal issue path, as we will simply defer
>>> the request to the hardware queue dispatch list if we fail. Once it's on
>>> the dispatch list, we never merge with it.
>>>
>>> Fix this from the direct issue path by flagging the request as
>>> REQ_NOMERGE so we don't change the size of it before issue.
>>>
>>> See also:
>>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>>
>>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case 
>>> of 'none'")
>>> Signed-off-by: Jens Axboe 
>>>
>>> ---
>>>
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 3f91c6e5b17a..d8f518c6ea38 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -1715,6 +1715,15 @@ static blk_status_t __blk_mq_issue_directly(struct 
>>> blk_mq_hw_ctx *hctx,
>>> break;
>>> case BLK_STS_RESOURCE:
>>> case BLK_STS_DEV_RESOURCE:
>>> +   /*
>>> +* If direct dispatch fails, we cannot allow any merging on
>>> +* this IO. Drivers (like SCSI) may have set up permanent state
>>> +* for this request, like SG tables and mappings, and if we
>>> +* merge to it later on then we'll still only do IO to the
>>> +* original part.
>>> +*/
>>> +   rq->cmd_flags |= REQ_NOMERGE;
>>> +
>>> blk_mq_update_dispatch_busy(hctx, true);
>>> __blk_mq_requeue_request(rq);
>>> break;
>>>
>>
>> Not sure it is enough to just mark it as NOMERGE, for example, driver
>> may have setup the .special_vec for discard, and NOMERGE may not prevent
>> request from entering elevator queue completely. Cause 'rq.rb_node' and
>> 'rq.special_vec' share same space.
> 
> We should rather limit the scope of the direct dispatch instead. It
> doesn't make sense to do for anything but read/write anyway.
> 
>> So how about inserting this request via blk_mq_request_bypass_insert()
>> in case that direct issue returns BUSY? Then it is invariant that
>> any request queued via .queue_rq() won't enter scheduler queue.
> 
> I did consider this, but I didn't want to experiment with exercising
> a new path for an important bug fix. You do realize that your original
> patch has been corrupting data for months? I think a little caution
> is in order here.

Here's a further limiting version. And we seriously need to clean up the
direct issue paths, it's ridiculous.


diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a..3262d83b9e07 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1715,6 +1715,15 @@ static blk_status_t __blk_mq_issue_directly(struct 
blk_mq_hw_ctx *hctx,
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
+   /*
+* If direct dispatch fails, we cannot allow any merging on
+* this IO. Drivers (like SCSI) may have set up permanent state
+* for this request, like SG tables and mappings, and if we
+* merge to it later on then we'll still only do IO to the
+* original part.
+*/
+   rq->cmd_flags |= REQ_NOMERGE;
+
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq);
break;
@@ -1727,6 +1736,18 @@ static blk_status_t __blk_mq_issue_directly(struct 
blk_mq_hw_ctx *hctx,
return ret;
 }
 
+/*
+ * Don't allow direct dispatch of anything but regular reads/writes,
+ * as some of the other commands can potentially share request space
+ * w
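
The hunk above is cut off in the archive; the gist of it is a read/write-only
gate on direct dispatch, roughly along these lines (the helper name is an
assumption):

	static bool blk_rq_can_direct_dispatch(struct request *rq)
	{
		/* only plain reads/writes are safe to issue directly */
		return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE;
	}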

Re: [PATCH] blk-mq: fix corruption with direct issue

2018-12-04 Thread Jens Axboe
On 12/4/18 6:37 PM, Ming Lei wrote:
> On Tue, Dec 04, 2018 at 03:47:46PM -0700, Jens Axboe wrote:
>> If we attempt a direct issue to a SCSI device, and it returns BUSY, then
>> we queue the request up normally. However, the SCSI layer may have
>> already setup SG tables etc for this particular command. If we later
>> merge with this request, then the old tables are no longer valid. Once
>> we issue the IO, we only read/write the original part of the request,
>> not the new state of it.
>>
>> This causes data corruption, and is most often noticed with the file
>> system complaining about the just read data being invalid:
>>
>> [  235.934465] EXT4-fs error (device sda1): ext4_iget:4831: inode #7142: 
>> comm dpkg-query: bad extra_isize 24937 (inode size 256)
>>
>> because most of it is garbage...
>>
>> This doesn't happen from the normal issue path, as we will simply defer
>> the request to the hardware queue dispatch list if we fail. Once it's on
>> the dispatch list, we never merge with it.
>>
>> Fix this from the direct issue path by flagging the request as
>> REQ_NOMERGE so we don't change the size of it before issue.
>>
>> See also:
>>   https://bugzilla.kernel.org/show_bug.cgi?id=201685
>>
>> Fixes: 6ce3dd6eec1 ("blk-mq: issue directly if hw queue isn't busy in case 
>> of 'none'")
>> Signed-off-by: Jens Axboe 
>>
>> ---
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 3f91c6e5b17a..d8f518c6ea38 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -1715,6 +1715,15 @@ static blk_status_t __blk_mq_issue_directly(struct 
>> blk_mq_hw_ctx *hctx,
>>  break;
>>  case BLK_STS_RESOURCE:
>>  case BLK_STS_DEV_RESOURCE:
>> +/*
>> + * If direct dispatch fails, we cannot allow any merging on
>> + * this IO. Drivers (like SCSI) may have set up permanent state
>> + * for this request, like SG tables and mappings, and if we
>> + * merge to it later on then we'll still only do IO to the
>> + * original part.
>> + */
>> +rq->cmd_flags |= REQ_NOMERGE;
>> +
>>  blk_mq_update_dispatch_busy(hctx, true);
>>  __blk_mq_requeue_request(rq);
>>  break;
>>
> 
> Not sure it is enough to just mark it as NOMERGE, for example, driver
> may have setup the .special_vec for discard, and NOMERGE may not prevent
> request from entering elevator queue completely. Cause 'rq.rb_node' and
> 'rq.special_vec' share same space.

We should rather limit the scope of the direct dispatch instead. It
doesn't make sense to do for anything but read/write anyway.

> So how about inserting this request via blk_mq_request_bypass_insert()
> in case that direct issue returns BUSY? Then it is invariant that
> any request queued via .queue_rq() won't enter scheduler queue.

I did consider this, but I didn't want to experiment with exercising
a new path for an important bug fix. You do realize that your original
patch has been corrupting data for months? I think a little caution
is in order here.

-- 
Jens Axboe



[PATCH 26/26] aio: add support for submission/completion rings

2018-12-04 Thread Jens Axboe
Experimental support for submitting and completing IO through rings
shared between the application and kernel.

The submission rings are struct iocb, like we would submit through
io_submit(), and the completion rings are struct io_event, like we
would pass in (and copy back) from io_getevents().

A new system call is added for this, io_ring_enter(). This system
call submits IO that is queued in the SQ ring, and/or completes IO
and stores the results in the CQ ring.

This could be augmented with a kernel thread that does the submission
and polling, then the application would never have to enter the
kernel to do IO.

Sample application: http://brick.kernel.dk/snaps/aio-ring.c

Signed-off-by: Jens Axboe 
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 fs/aio.c   | 312 +++--
 include/linux/syscalls.h   |   4 +-
 include/uapi/linux/aio_abi.h   |  26 +++
 4 files changed, 323 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 67c357225fb0..55a26700a637 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -344,6 +344,7 @@
 333common  io_pgetevents   __x64_sys_io_pgetevents
 334common  rseq__x64_sys_rseq
 335common  io_setup2   __x64_sys_io_setup2
+336common  io_ring_enter   __x64_sys_io_ring_enter
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index 39aaffd6d436..6024c6943d7d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -142,6 +142,10 @@ struct kioctx {
 
struct aio_mapped_range iocb_range;
 
+   /* if used, completion and submission rings */
+   struct aio_mapped_range sq_ring;
+   struct aio_mapped_range cq_ring;
+
struct rcu_work free_rwork; /* see free_ioctx() */
 
/*
@@ -297,6 +301,8 @@ static const struct address_space_operations aio_ctx_aops;
 
 static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
+static const unsigned int event_page_shift =
+   ilog2(PAGE_SIZE / sizeof(struct io_event));
 
 /*
  * We rely on block level unplugs to flush pending requests, if we schedule
@@ -307,6 +313,7 @@ static const bool aio_use_state_req_list = true;
 static const bool aio_use_state_req_list = false;
 #endif
 
+static void aio_scqring_unmap(struct kioctx *);
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
 static void aio_iocb_buffer_unmap(struct kioctx *);
@@ -673,6 +680,7 @@ static void free_ioctx(struct work_struct *work)
 
aio_iocb_buffer_unmap(ctx);
aio_useriocb_unmap(ctx);
+   aio_scqring_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
   percpu_ref_exit(&ctx->reqs);
@@ -1218,6 +1226,47 @@ static void aio_fill_event(struct io_event *ev, struct 
aio_kiocb *iocb,
ev->res2 = res2;
 }
 
+static struct io_event *__aio_get_cqring_ev(struct aio_io_event_ring *ring,
+   struct aio_mapped_range *range,
+   unsigned *next_tail)
+{
+   struct io_event *ev;
+   unsigned tail;
+
+   smp_rmb();
+   tail = READ_ONCE(ring->tail);
+   *next_tail = tail + 1;
+   if (*next_tail == ring->nr_events)
+   *next_tail = 0;
+   if (*next_tail == READ_ONCE(ring->head))
+   return NULL;
+
+   /* io_event array starts offset one into the mapped range */
+   tail++;
+   ev = page_address(range->pages[tail >> event_page_shift]);
+   tail &= ((1 << event_page_shift) - 1);
+   return ev + tail;
+}
+
+static void aio_commit_cqring(struct kioctx *ctx, unsigned next_tail)
+{
+   struct aio_io_event_ring *ring;
+
+   ring = page_address(ctx->cq_ring.pages[0]);
+   if (next_tail != ring->tail) {
+   ring->tail = next_tail;
+   smp_wmb();
+   }
+}
+
+static struct io_event *aio_peek_cqring(struct kioctx *ctx, unsigned *ntail)
+{
+   struct aio_io_event_ring *ring;
+
+   ring = page_address(ctx->cq_ring.pages[0]);
+   return __aio_get_cqring_ev(ring, &ctx->cq_ring, ntail);
+}
+
 static void aio_ring_complete(struct kioctx *ctx, struct aio_kiocb *iocb,
  long res, long res2)
 {
@@ -1279,7 +1328,17 @@ static void aio_complete(struct aio_kiocb *iocb, long 
res, long res2)
 {
struct kioctx *ctx = iocb->ki_ctx;
 
-   aio_ring_complete(ctx, iocb, res, res2);
+   if (ctx->flags & IOCTX_FLAG_SCQRING) {
+   struct io_event *ev;
+   unsigned int tail;
+
+   /* Can't fail, we have a ring reservation */
+   ev = aio_peek_cqr

[PATCH 25/26] aio: split old ring complete out from aio_complete()

2018-12-04 Thread Jens Axboe
Signed-off-by: Jens Axboe 
---
 fs/aio.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1c8a8bb631a9..39aaffd6d436 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1218,12 +1218,9 @@ static void aio_fill_event(struct io_event *ev, struct 
aio_kiocb *iocb,
ev->res2 = res2;
 }
 
-/* aio_complete
- * Called when the io request on the given iocb is complete.
- */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_ring_complete(struct kioctx *ctx, struct aio_kiocb *iocb,
+ long res, long res2)
 {
-   struct kioctx   *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
unsigned tail, pos, head;
@@ -1273,6 +1270,16 @@ static void aio_complete(struct aio_kiocb *iocb, long 
res, long res2)
   spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
pr_debug("added to ring %p at [%u]\n", iocb, tail);
+}
+
+/* aio_complete
+ * Called when the io request on the given iocb is complete.
+ */
+static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+{
+   struct kioctx *ctx = iocb->ki_ctx;
+
+   aio_ring_complete(ctx, iocb, res, res2);
 
/*
 * Check if the user asked us to deliver the result through an
-- 
2.17.1



[PATCH 23/26] fs: add support for mapping an ITER_BVEC for O_DIRECT

2018-12-04 Thread Jens Axboe
This adds support for sync/async O_DIRECT to make a bvec type iter
for bdev access, as well as iomap.

Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 16 
 fs/iomap.c |  5 -
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b8f574615792..236c6abe649d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -219,7 +219,10 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct 
iov_iter *iter,
bio.bi_end_io = blkdev_bio_end_io_simple;
bio.bi_ioprio = iocb->ki_ioprio;
 
-   ret = bio_iov_iter_get_pages(&bio, iter);
+   if (iov_iter_is_bvec(iter))
+   ret = bio_iov_bvec_add_pages(&bio, iter);
+   else
+   ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
goto out;
ret = bio.bi_iter.bi_size;
@@ -326,8 +329,9 @@ static void blkdev_bio_end_io(struct bio *bio)
struct bio_vec *bvec;
int i;
 
-   bio_for_each_segment_all(bvec, bio, i)
-   put_page(bvec->bv_page);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_for_each_segment_all(bvec, bio, i)
+   put_page(bvec->bv_page);
bio_put(bio);
}
 }
@@ -381,7 +385,11 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
 
-   ret = bio_iov_iter_get_pages(bio, iter);
+   if (iov_iter_is_bvec(iter))
+   ret = bio_iov_bvec_add_pages(bio, iter);
+   else
+   ret = bio_iov_iter_get_pages(bio, iter);
+
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
diff --git a/fs/iomap.c b/fs/iomap.c
index bd483fcb7b5a..f00e5e198c57 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1673,7 +1673,10 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
loff_t length,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
 
-   ret = bio_iov_iter_get_pages(bio, &iter);
+   if (iov_iter_is_bvec(&iter))
+   ret = bio_iov_bvec_add_pages(bio, &iter);
+   else
+   ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
/*
 * We have to stop part way through an IO. We must fall
-- 
2.17.1



[PATCH 24/26] aio: add support for pre-mapped user IO buffers

2018-12-04 Thread Jens Axboe
If we have fixed user buffers, we can map them into the kernel when we
setup the io_context. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must set both
IOCTX_FLAG_USERIOCB, to provide iocb's in userspace, and then
IOCTX_FLAG_FIXEDBUFS. The latter tells aio that the iocbs that are
mapped already contain valid destination and sizes. These buffers can
then be mapped into the kernel for the lifetime of the io_context, as
opposed to just the duration of each single IO.

Only works with non-vectored read/write commands for now, not with
PREADV/PWRITEV.

A limit of 4M is imposed as the largest buffer we currently support.
There's nothing preventing us from going larger, but we need some cap,
and 4M seemed like it would definitely be big enough. RLIMIT_MEMLOCK
is used to cap the total amount of memory pinned.

See the fio change for how to utilize this feature:

http://git.kernel.dk/cgit/fio/commit/?id=2041bd343da1c1e955253f62374588718c64f0f3
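
As a rough idea of the userspace side, assuming no libc wrapper for the new
system call yet (the syscall number and flag names come from this series,
everything else is just illustration):

	/* minimal sketch: register fixed buffers at io_setup2() time */
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/aio_abi.h>	/* assumes headers with the new IOCTX_FLAG_* bits */

	#define __NR_io_setup2	335	/* x86-64, from this series */

	/* iocbs must be a page aligned array of nr entries, one per request */
	static aio_context_t setup_fixed(struct iocb *iocbs, unsigned nr,
					 void *bufs, size_t buf_len)
	{
		aio_context_t ctx = 0;
		unsigned i;

		for (i = 0; i < nr; i++) {
			memset(&iocbs[i], 0, sizeof(iocbs[i]));
			/* each iocb carries its pre-registered buffer and size (<= 4MB) */
			iocbs[i].aio_buf = (unsigned long) bufs + i * buf_len;
			iocbs[i].aio_nbytes = buf_len;
		}
		if (syscall(__NR_io_setup2, nr,
			    IOCTX_FLAG_USERIOCB | IOCTX_FLAG_FIXEDBUFS,
			    iocbs, NULL, NULL, &ctx) < 0)
			return 0;
		return ctx;
	}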

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 193 ---
 include/uapi/linux/aio_abi.h |   1 +
 2 files changed, 177 insertions(+), 17 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1735ec556089..1c8a8bb631a9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -97,6 +98,11 @@ struct aio_mapped_range {
long nr_pages;
 };
 
+struct aio_mapped_ubuf {
+   struct bio_vec *bvec;
+   unsigned int nr_bvecs;
+};
+
 struct kioctx {
struct percpu_ref   users;
atomic_tdead;
@@ -132,6 +138,8 @@ struct kioctx {
struct page **ring_pages;
longnr_pages;
 
+   struct aio_mapped_ubuf  *user_bufs;
+
struct aio_mapped_range iocb_range;
 
struct rcu_work free_rwork; /* see free_ioctx() */
@@ -301,6 +309,7 @@ static const bool aio_use_state_req_list = false;
 
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
+static void aio_iocb_buffer_unmap(struct kioctx *);
 
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
@@ -662,6 +671,7 @@ static void free_ioctx(struct work_struct *work)
  free_rwork);
pr_debug("freeing %p\n", ctx);
 
+   aio_iocb_buffer_unmap(ctx);
aio_useriocb_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
@@ -1672,6 +1682,122 @@ static int aio_useriocb_map(struct kioctx *ctx, struct 
iocb __user *iocbs)
   return aio_map_range(&ctx->iocb_range, iocbs, size, 0);
 }
 
+static void aio_iocb_buffer_unmap(struct kioctx *ctx)
+{
+   int i, j;
+
+   if (!ctx->user_bufs)
+   return;
+
+   for (i = 0; i < ctx->max_reqs; i++) {
+   struct aio_mapped_ubuf *amu = &ctx->user_bufs[i];
+
+   for (j = 0; j < amu->nr_bvecs; j++)
+   put_page(amu->bvec[j].bv_page);
+
+   kfree(amu->bvec);
+   amu->nr_bvecs = 0;
+   }
+
+   kfree(ctx->user_bufs);
+   ctx->user_bufs = NULL;
+}
+
+static int aio_iocb_buffer_map(struct kioctx *ctx)
+{
+   unsigned long total_pages, page_limit;
+   struct page **pages = NULL;
+   int i, j, got_pages = 0;
+   struct iocb *iocb;
+   int ret = -EINVAL;
+
+   ctx->user_bufs = kzalloc(ctx->max_reqs * sizeof(struct aio_mapped_ubuf),
+   GFP_KERNEL);
+   if (!ctx->user_bufs)
+   return -ENOMEM;
+
+   /* Don't allow more pages than we can safely lock */
+   total_pages = 0;
+   page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+   for (i = 0; i < ctx->max_reqs; i++) {
+   struct aio_mapped_ubuf *amu = &ctx->user_bufs[i];
+   unsigned long off, start, end, ubuf;
+   int pret, nr_pages;
+   size_t size;
+
+   iocb = aio_iocb_from_index(ctx, i);
+
+   /*
+* Don't impose further limits on the size and buffer
+* constraints here, we'll -EINVAL later when IO is
+* submitted if they are wrong.
+*/
+   ret = -EFAULT;
+   if (!iocb->aio_buf)
+   goto err;
+
+   /* arbitrary limit, but we need something */
+   if (iocb->aio_nbytes > SZ_4M)
+   goto err;
+
+   ubuf = iocb->aio_buf;
+   end = (ubuf + iocb->aio_nbytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   start = ubuf >> PAGE_SHIFT;
+   nr_pages = end - start;
+
+   ret = -ENOMEM;
+   if (total_pages + nr_pages > page_limit)
+   goto err;
+
+   if

[PATCH 21/26] block: add BIO_HOLD_PAGES flag

2018-12-04 Thread Jens Axboe
For user mapped IO, we do get_user_pages() upfront, and then do a
put_page() on each page at end_io time to release the page reference. In
preparation for having permanently mapped pages, add a BIO_HOLD_PAGES
flag that tells us not to release the pages, the caller will do that.

Signed-off-by: Jens Axboe 
---
 block/bio.c   | 6 --
 include/linux/blk_types.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 03895cc0d74a..ab174bce5436 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1635,7 +1635,8 @@ static void bio_dirty_fn(struct work_struct *work)
next = bio->bi_private;
 
bio_set_pages_dirty(bio);
-   bio_release_pages(bio);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_release_pages(bio);
bio_put(bio);
}
 }
@@ -1651,7 +1652,8 @@ void bio_check_pages_dirty(struct bio *bio)
goto defer;
}
 
-   bio_release_pages(bio);
+   if (!bio_flagged(bio, BIO_HOLD_PAGES))
+   bio_release_pages(bio);
bio_put(bio);
return;
 defer:
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1d9c3da0f2a1..344e5b61aa4e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -227,6 +227,7 @@ struct bio {
 #define BIO_TRACE_COMPLETION 10/* bio_endio() should trace the final 
completion
 * of this bio. */
 #define BIO_QUEUE_ENTERED 11   /* can use blk_queue_enter_live() */
+#define BIO_HOLD_PAGES 12  /* don't put O_DIRECT pages */
 
 /* See BVEC_POOL_OFFSET below before adding new flags */
 
-- 
2.17.1



[PATCH 19/26] aio: split iocb init from allocation

2018-12-04 Thread Jens Axboe
In preparation for having pre-allocated requests, which we then just
need to initialize before use.

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 634b540b0c92..416bb2e365e0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1099,6 +1099,16 @@ static bool get_reqs_available(struct kioctx *ctx)
return __get_reqs_available(ctx);
 }
 
+static void aio_iocb_init(struct kioctx *ctx, struct aio_kiocb *req)
+{
+   percpu_ref_get(&ctx->reqs);
+   req->ki_ctx = ctx;
+   INIT_LIST_HEAD(&req->ki_list);
+   req->ki_flags = 0;
+   refcount_set(&req->ki_refcnt, 0);
+   req->ki_eventfd = NULL;
+}
+
 /* aio_get_req
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -,12 +1121,7 @@ static inline struct aio_kiocb *aio_get_req(struct 
kioctx *ctx)
if (unlikely(!req))
return NULL;
 
-   percpu_ref_get(&ctx->reqs);
-   req->ki_ctx = ctx;
-   INIT_LIST_HEAD(&req->ki_list);
-   req->ki_flags = 0;
-   refcount_set(&req->ki_refcnt, 0);
-   req->ki_eventfd = NULL;
+   aio_iocb_init(ctx, req);
return req;
 }
 
-- 
2.17.1



[PATCH 04/26] block: use REQ_HIPRI_ASYNC for non-sync polled IO

2018-12-04 Thread Jens Axboe
Tell the block layer if it's a sync or async polled request, so it can
do the right thing.

Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6de8d35f6e41..b8f574615792 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -402,8 +402,12 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
 
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
if (!nr_pages) {
-   if (iocb->ki_flags & IOCB_HIPRI)
-   bio->bi_opf |= REQ_HIPRI;
+   if (iocb->ki_flags & IOCB_HIPRI) {
+   if (!is_sync)
+   bio->bi_opf |= REQ_HIPRI_ASYNC;
+   else
+   bio->bi_opf |= REQ_HIPRI;
+   }
 
qc = submit_bio(bio);
WRITE_ONCE(iocb->ki_cookie, qc);
-- 
2.17.1



[PATCH 15/26] aio: support for IO polling

2018-12-04 Thread Jens Axboe
Add polled variants of PREAD/PREADV and PWRITE/PWRITEV. These act
like their non-polled counterparts, except we expect to poll for
completion of them. The polling happens at io_getevent() time, and
works just like non-polled IO.

To setup an io_context for polled IO, the application must call
io_setup2() with IOCTX_FLAG_IOPOLL as one of the flags. It is illegal
to mix and match polled and non-polled IO on an io_context.

Polled IO doesn't support the user mapped completion ring. Events
must be reaped through the io_getevents() system call. For non-irq
driven poll devices, there's no way to support completion reaping
from userspace by just looking at the ring. The application itself
is the one that pulls completion entries.
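
In userspace terms the reap side is just a normal io_getevents() loop against
a context created with IOCTX_FLAG_IOPOLL; a minimal sketch (the flag comes
from the patched <linux/aio_abi.h>, everything else is illustration):

	/* minimal sketch: reap polled completions via io_getevents() */
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/aio_abi.h>

	static long reap_some(aio_context_t ctx, struct io_event *evs, long max)
	{
		/*
		 * min_nr == 0 turns this into a pure poll check: return
		 * whatever has completed so far without waiting for more.
		 */
		return syscall(SYS_io_getevents, ctx, 0, max, evs, NULL);
	}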

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 393 +++
 include/uapi/linux/aio_abi.h |   3 +
 2 files changed, 360 insertions(+), 36 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index bb6f07ca6940..5d34317c2929 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -153,6 +153,18 @@ struct kioctx {
atomic_treqs_available;
} cacheline_aligned_in_smp;
 
+   /* iopoll submission state */
+   struct {
+   spinlock_t poll_lock;
+   struct list_head poll_submitted;
+   } cacheline_aligned_in_smp;
+
+   /* iopoll completion state */
+   struct {
+   struct list_head poll_completing;
+   struct mutex getevents_lock;
+   } cacheline_aligned_in_smp;
+
struct {
spinlock_t  ctx_lock;
struct list_head active_reqs;   /* used for cancellation */
@@ -205,14 +217,27 @@ struct aio_kiocb {
__u64   ki_user_data;   /* user's data for completion */
 
struct list_headki_list;/* the aio core uses this
-* for cancellation */
+* for cancellation, or for
+* polled IO */
+
+   unsigned long   ki_flags;
+#define IOCB_POLL_COMPLETED0   /* polled IO has completed */
+#define IOCB_POLL_EAGAIN   1   /* polled submission got EAGAIN */
+
refcount_t  ki_refcnt;
 
-   /*
-* If the aio_resfd field of the userspace iocb is not zero,
-* this is the underlying eventfd context to deliver events to.
-*/
-   struct eventfd_ctx  *ki_eventfd;
+   union {
+   /*
+* If the aio_resfd field of the userspace iocb is not zero,
+* this is the underlying eventfd context to deliver events to.
+*/
+   struct eventfd_ctx  *ki_eventfd;
+
+   /*
+* For polled IO, stash completion info here
+*/
+   struct io_event ki_ev;
+   };
 };
 
 /*-- sysctl variables*/
@@ -233,6 +258,7 @@ static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
 
 static void aio_useriocb_unmap(struct kioctx *);
+static void aio_iopoll_reap_events(struct kioctx *);
 
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
@@ -471,11 +497,15 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned 
int nr_events)
int i;
struct file *file;
 
-   /* Compensate for the ring buffer's head/tail overlap entry */
-   nr_events += 2; /* 1 is required, 2 for good luck */
-
+   /*
+* Compensate for the ring buffer's head/tail overlap entry.
+* IO polling doesn't require any io event entries
+*/
size = sizeof(struct aio_ring);
-   size += sizeof(struct io_event) * nr_events;
+   if (!(ctx->flags & IOCTX_FLAG_IOPOLL)) {
+   nr_events += 2; /* 1 is required, 2 for good luck */
+   size += sizeof(struct io_event) * nr_events;
+   }
 
nr_pages = PFN_UP(size);
if (nr_pages < 0)
@@ -758,6 +788,11 @@ static struct kioctx *io_setup_flags(unsigned long ctxid,
 
   INIT_LIST_HEAD(&ctx->active_reqs);
 
+   spin_lock_init(&ctx->poll_lock);
+   INIT_LIST_HEAD(&ctx->poll_submitted);
+   INIT_LIST_HEAD(&ctx->poll_completing);
+   mutex_init(&ctx->getevents_lock);
+
   if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
goto err;
 
@@ -829,11 +864,15 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx 
*ctx,
 {
struct kioctx_table *table;
 
+   mutex_lock(&ctx->getevents_lock);
   spin_lock(&mm->ioctx_lock);
   if (atomic_xchg(&ctx->dead, 1)) {
   spin_unlock(&mm->ioctx_lock);
+   mutex_unlock(&ctx->getevents_lock);
   return -EINVAL;
   }
+   aio_iopoll_reap_events(ctx);
+   mutex_unlock(&ctx->getevents_lock);
 
table = rcu_dereference_raw(mm->ioctx_table)

[PATCH 03/26] block: wire up block device iopoll method

2018-12-04 Thread Jens Axboe
From: Christoph Hellwig 

Just call blk_poll on the iocb cookie, we can derive the block device
from the inode trivially.

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 fs/block_dev.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e1886cc7048f..6de8d35f6e41 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -281,6 +281,14 @@ struct blkdev_dio {
 
 static struct bio_set blkdev_dio_pool;
 
+static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
+{
+   struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
+   struct request_queue *q = bdev_get_queue(bdev);
+
+   return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
+}
+
 static void blkdev_bio_end_io(struct bio *bio)
 {
struct blkdev_dio *dio = bio->bi_private;
@@ -398,6 +406,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
*iter, int nr_pages)
bio->bi_opf |= REQ_HIPRI;
 
qc = submit_bio(bio);
+   WRITE_ONCE(iocb->ki_cookie, qc);
break;
}
 
@@ -2070,6 +2079,7 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read_iter  = blkdev_read_iter,
.write_iter = blkdev_write_iter,
+   .iopoll = blkdev_iopoll,
.mmap   = generic_file_mmap,
.fsync  = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
-- 
2.17.1



[PATCH 02/26] block: add REQ_HIPRI_ASYNC

2018-12-04 Thread Jens Axboe
For the upcoming async polled IO, we can't sleep allocating requests.
If we do, then we introduce a deadlock where the submitter already
has async polled IO in-flight, but can't wait for them to complete
since polled requests must be actively found and reaped.

Signed-off-by: Jens Axboe 
---
 include/linux/blk_types.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c0ba1a038ff3..1d9c3da0f2a1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -346,6 +346,7 @@ enum req_flag_bits {
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
 #define REQ_NOUNMAP(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI  (1ULL << __REQ_HIPRI)
+#define REQ_HIPRI_ASYNC(REQ_HIPRI | REQ_NOWAIT)
 
 #define REQ_DRV(1ULL << __REQ_DRV)
 #define REQ_SWAP   (1ULL << __REQ_SWAP)
-- 
2.17.1



[PATCH 20/26] aio: batch aio_kiocb allocation

2018-12-04 Thread Jens Axboe
Similarly to how we use the state->ios_left to know how many references
to get to a file, we can use it to allocate the aio_kiocb's we need in
bulk.

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 47 +++
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 416bb2e365e0..1735ec556089 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -240,6 +240,8 @@ struct aio_kiocb {
};
 };
 
+#define AIO_IOPOLL_BATCH   8
+
 struct aio_submit_state {
struct kioctx *ctx;
 
@@ -254,6 +256,13 @@ struct aio_submit_state {
struct list_head req_list;
unsigned int req_count;
 
+   /*
+* aio_kiocb alloc cache
+*/
+   void *iocbs[AIO_IOPOLL_BATCH];
+   unsigned int free_iocbs;
+   unsigned int cur_iocb;
+
/*
 * File reference cache
 */
@@ -1113,15 +1122,35 @@ static void aio_iocb_init(struct kioctx *ctx, struct 
aio_kiocb *req)
  * Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
  */
-static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
+static struct aio_kiocb *aio_get_req(struct kioctx *ctx,
+struct aio_submit_state *state)
 {
struct aio_kiocb *req;
 
-   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
-   if (unlikely(!req))
-   return NULL;
+   if (!state)
+   req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
+   else if (!state->free_iocbs) {
+   size_t size;
+
+   size = min_t(size_t, state->ios_left, ARRAY_SIZE(state->iocbs));
+   size = kmem_cache_alloc_bulk(kiocb_cachep, GFP_KERNEL, size,
+   state->iocbs);
+   if (size < 0)
+   return ERR_PTR(size);
+   else if (!size)
+   return ERR_PTR(-ENOMEM);
+   state->free_iocbs = size - 1;
+   state->cur_iocb = 1;
+   req = state->iocbs[0];
+   } else {
+   req = state->iocbs[state->cur_iocb];
+   state->free_iocbs--;
+   state->cur_iocb++;
+   }
+
+   if (req)
+   aio_iocb_init(ctx, req);
 
-   aio_iocb_init(ctx, req);
return req;
 }
 
@@ -1359,8 +1388,6 @@ static bool aio_read_events(struct kioctx *ctx, long 
min_nr, long nr,
return ret < 0 || *i >= min_nr;
 }
 
-#define AIO_IOPOLL_BATCH   8
-
 /*
  * Process completed iocb iopoll entries, copying the result to userspace.
  */
@@ -2357,7 +2384,7 @@ static int __io_submit_one(struct kioctx *ctx, const 
struct iocb *iocb,
return -EAGAIN;
 
ret = -EAGAIN;
-   req = aio_get_req(ctx);
+   req = aio_get_req(ctx, state);
if (unlikely(!req))
goto out_put_reqs_available;
 
@@ -2489,6 +2516,9 @@ static void aio_submit_state_end(struct aio_submit_state 
*state)
   if (!list_empty(&state->req_list))
aio_flush_state_reqs(state->ctx, state);
aio_file_put(state);
+   if (state->free_iocbs)
+   kmem_cache_free_bulk(kiocb_cachep, state->free_iocbs,
+   &state->iocbs[state->cur_iocb]);
 }
 
 /*
@@ -2500,6 +2530,7 @@ static void aio_submit_state_start(struct 
aio_submit_state *state,
state->ctx = ctx;
   INIT_LIST_HEAD(&state->req_list);
state->req_count = 0;
+   state->free_iocbs = 0;
state->file = NULL;
state->ios_left = max_ios;
 #ifdef CONFIG_BLOCK
-- 
2.17.1



[PATCH 13/26] aio: add io_setup2() system call

2018-12-04 Thread Jens Axboe
This is just like io_setup(), except it adds a flags argument to let the
caller control/define some of the io_context behavior.

Outside of the flags, we add an iocb array and two user pointers for
future use.
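
Since there is no libc wrapper for it, calling it means going through
syscall(2); a minimal sketch (the syscall number is from the x86-64 table
below, the wrapper itself is just illustration):

	/* minimal sketch: raw io_setup2() wrapper */
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/aio_abi.h>

	#define __NR_io_setup2	335	/* x86-64, added by this patch */

	static int io_setup2(unsigned nr_events, unsigned flags, struct iocb *iocbs,
			     void *user1, void *user2, aio_context_t *ctxp)
	{
		return syscall(__NR_io_setup2, nr_events, flags, iocbs,
			       user1, user2, ctxp);
	}

	/* e.g.: aio_context_t ctx = 0; io_setup2(128, 0, NULL, NULL, NULL, &ctx); */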

Signed-off-by: Jens Axboe 
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/aio.c   | 69 --
 include/linux/syscalls.h   |  3 ++
 include/uapi/asm-generic/unistd.h  |  4 +-
 kernel/sys_ni.c|  1 +
 5 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..67c357225fb0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@
 332common  statx   __x64_sys_statx
 333common  io_pgetevents   __x64_sys_io_pgetevents
 334common  rseq__x64_sys_rseq
+335common  io_setup2   __x64_sys_io_setup2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index 173f1f79dc8f..26631d6872d2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -100,6 +100,8 @@ struct kioctx {
 
unsigned long   user_id;
 
+   unsigned intflags;
+
struct __percpu kioctx_cpu *cpu;
 
/*
@@ -686,10 +688,8 @@ static void aio_nr_sub(unsigned nr)
   spin_unlock(&aio_nr_lock);
 }
 
-/* ioctx_alloc
- * Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
- */
-static struct kioctx *ioctx_alloc(unsigned nr_events)
+static struct kioctx *io_setup_flags(unsigned long ctxid,
+unsigned int nr_events, unsigned int flags)
 {
struct mm_struct *mm = current->mm;
struct kioctx *ctx;
@@ -701,6 +701,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 */
unsigned int max_reqs = nr_events;
 
+   if (unlikely(ctxid || nr_events == 0)) {
+   pr_debug("EINVAL: ctx %lu nr_events %u\n",
+ctxid, nr_events);
+   return ERR_PTR(-EINVAL);
+   }
+
/*
 * We keep track of the number of available ringbuffer slots, to prevent
 * overflow (reqs_available), and we also use percpu counters for this.
@@ -726,6 +732,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
if (!ctx)
return ERR_PTR(-ENOMEM);
 
+   ctx->flags = flags;
ctx->max_reqs = max_reqs;
 
   spin_lock_init(&ctx->ctx_lock);
@@ -1281,6 +1288,34 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
return ret;
 }
 
+SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
+   iocbs, void __user *, user1, void __user *, user2,
+   aio_context_t __user *, ctxp)
+{
+   struct kioctx *ioctx;
+   unsigned long ctx;
+   long ret;
+
+   if (flags || user1 || user2)
+   return -EINVAL;
+
+   ret = get_user(ctx, ctxp);
+   if (unlikely(ret))
+   goto out;
+
+   ioctx = io_setup_flags(ctx, nr_events, flags);
+   ret = PTR_ERR(ioctx);
+   if (IS_ERR(ioctx))
+   goto out;
+
+   ret = put_user(ioctx->user_id, ctxp);
+   if (ret)
+   kill_ioctx(current->mm, ioctx, NULL);
+   percpu_ref_put(&ioctx->users);
+out:
+   return ret;
+}
+
 /* sys_io_setup:
  * Create an aio_context capable of receiving at least nr_events.
  * ctxp must not point to an aio_context that already exists, and
@@ -1296,7 +1331,7 @@ static long read_events(struct kioctx *ctx, long min_nr, 
long nr,
  */
 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
-   struct kioctx *ioctx = NULL;
+   struct kioctx *ioctx;
unsigned long ctx;
long ret;
 
@@ -1304,14 +1339,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
aio_context_t __user *, ctxp)
if (unlikely(ret))
goto out;
 
-   ret = -EINVAL;
-   if (unlikely(ctx || nr_events == 0)) {
-   pr_debug("EINVAL: ctx %lu nr_events %u\n",
-ctx, nr_events);
-   goto out;
-   }
-
-   ioctx = ioctx_alloc(nr_events);
+   ioctx = io_setup_flags(ctx, nr_events, 0);
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
@@ -1327,7 +1355,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
aio_context_t __user *, ctxp)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
 {
-   struct kioctx *ioctx = NULL;
+   struct kioctx *ioctx;
unsigned long ctx;
long ret;
 
@@ -1335,23 +1363,14 @@ COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
u32 __user *, ctx32p)
if (unlikely(ret))
goto out;
 
-  

[PATCH 11/26] aio: split out iocb copy from io_submit_one()

2018-12-04 Thread Jens Axboe
In preparation for handing in iocbs in a different fashion as well. Also
make it clear that the iocb being passed in isn't modified, by marking
it const throughout.

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 68 +++-
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index cf93b92bfb1e..06c8bcc72496 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1420,7 +1420,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long 
res, long res2)
aio_complete(iocb, res, res2);
 }
 
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 {
int ret;
 
@@ -1461,7 +1461,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb 
*iocb)
return ret;
 }
 
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
+static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
bool vectored, bool compat, struct iov_iter *iter)
 {
void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
@@ -1500,8 +1500,8 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t 
ret)
}
 }
 
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
-   bool compat)
+static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+   bool vectored, bool compat)
 {
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1533,8 +1533,8 @@ static ssize_t aio_read(struct kiocb *req, struct iocb 
*iocb, bool vectored,
return ret;
 }
 
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
-   bool compat)
+static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+bool vectored, bool compat)
 {
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct iov_iter iter;
@@ -1589,7 +1589,8 @@ static void aio_fsync_work(struct work_struct *work)
aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
 }
 
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+bool datasync)
 {
if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
iocb->aio_rw_flags))
@@ -1717,7 +1718,7 @@ aio_poll_queue_proc(struct file *file, struct 
wait_queue_head *head,
   add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
struct kioctx *ctx = aiocb->ki_ctx;
   struct poll_iocb *req = &aiocb->poll;
@@ -1789,27 +1790,23 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct 
iocb *iocb)
return 0;
 }
 
-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-bool compat)
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+  struct iocb __user *user_iocb, bool compat)
 {
struct aio_kiocb *req;
-   struct iocb iocb;
ssize_t ret;
 
-   if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
-   return -EFAULT;
-
/* enforce forwards compatibility on users */
-   if (unlikely(iocb.aio_reserved2)) {
+   if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
 
/* prevent overflows */
if (unlikely(
-   (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
-   (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
-   ((ssize_t)iocb.aio_nbytes < 0)
+   (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+   (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+   ((ssize_t)iocb->aio_nbytes < 0)
   )) {
pr_debug("EINVAL: overflow check\n");
return -EINVAL;
@@ -1823,14 +1820,14 @@ static int io_submit_one(struct kioctx *ctx, struct 
iocb __user *user_iocb,
if (unlikely(!req))
goto out_put_reqs_available;
 
-   if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+   if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
 * instance of the file* now. The file descriptor must be
 * an eventfd() fd, and will be signaled for each completed
 * event using the eventfd_signal() function.
 */
-   req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
+   req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 

[PATCH 14/26] aio: add support for having user mapped iocbs

2018-12-04 Thread Jens Axboe
For io_submit(), we have to first copy each pointer to an iocb, then
copy the iocb. The latter is 64 bytes in size, and that's a lot of
copying for a single IO.

Add support for setting IOCTX_FLAG_USERIOCB through the new io_setup2()
system call, which allows the iocbs to reside in userspace. If this flag
is used, then io_submit() doesn't take pointers to iocbs anymore, it
takes an index value into the array of iocbs instead. Similarly, for
io_getevents(), the iocb ->obj will be the index, not the pointer to the
iocb.

See the change made to fio to support this feature, it's pretty
trivial to adapt to. For applications, like fio, that previously
embedded the iocb inside an application private structure, some sort
of lookup table/structure is needed to find the private IO structure
from the index at io_getevents() time.

http://git.kernel.dk/cgit/fio/commit/?id=3c3168e91329c83880c91e5abc28b9d6b940fd95
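
As a rough illustration, a minimal (hypothetical) userspace consumer
could look like the below, assuming uapi headers from a kernel with
this series applied. __NR_io_setup2 and IOCTX_FLAG_USERIOCB are
whatever those patched headers define; none of this is stable ABI:

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

#define NR_EVENTS	128

int main(void)
{
	struct iocb *iocbs;
	struct io_event ev;
	aio_context_t ctx = 0;
	char buf[4096];
	long ret;

	/* the iocb array must be page aligned, the kernel pins these pages */
	iocbs = aligned_alloc(4096, NR_EVENTS * sizeof(*iocbs));
	memset(iocbs, 0, NR_EVENTS * sizeof(*iocbs));

	/* __NR_io_setup2 only exists with this series applied */
	ret = syscall(__NR_io_setup2, NR_EVENTS, IOCTX_FLAG_USERIOCB,
		      iocbs, NULL, NULL, &ctx);
	if (ret < 0)
		return 1;

	/* fill in slot 0 directly, no iocb copy into the kernel needed */
	iocbs[0].aio_lio_opcode = IOCB_CMD_PREAD;
	iocbs[0].aio_fildes = STDIN_FILENO;
	iocbs[0].aio_buf = (unsigned long) buf;
	iocbs[0].aio_nbytes = sizeof(buf);

	/* submit by index: the "pointer" slot carries the value 0 */
	struct iocb *index = (struct iocb *) 0UL;
	ret = syscall(__NR_io_submit, ctx, 1, &index);
	if (ret != 1)
		return 1;

	/* the returned event ->obj carries the index as well */
	ret = syscall(__NR_io_getevents, ctx, 1, 1, &ev, NULL);
	return ret == 1 ? 0 : 1;
}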

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 126 +++
 include/uapi/linux/aio_abi.h |   2 +
 2 files changed, 116 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 26631d6872d2..bb6f07ca6940 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -92,6 +92,11 @@ struct ctx_rq_wait {
atomic_t count;
 };
 
+struct aio_mapped_range {
+   struct page **pages;
+   long nr_pages;
+};
+
 struct kioctx {
struct percpu_ref   users;
	atomic_t		dead;
@@ -127,6 +132,8 @@ struct kioctx {
struct page **ring_pages;
	long			nr_pages;
 
+   struct aio_mapped_range iocb_range;
+
struct rcu_work free_rwork; /* see free_ioctx() */
 
/*
@@ -222,6 +229,11 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
+static const unsigned int iocb_page_shift =
+   ilog2(PAGE_SIZE / sizeof(struct iocb));
+
+static void aio_useriocb_unmap(struct kioctx *);
+
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
struct file *file;
@@ -578,6 +590,7 @@ static void free_ioctx(struct work_struct *work)
  free_rwork);
pr_debug("freeing %p\n", ctx);
 
+   aio_useriocb_unmap(ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
	percpu_ref_exit(&ctx->reqs);
@@ -1288,6 +1301,70 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
return ret;
 }
 
+static struct iocb *aio_iocb_from_index(struct kioctx *ctx, int index)
+{
+   struct iocb *iocb;
+
+   iocb = page_address(ctx->iocb_range.pages[index >> iocb_page_shift]);
+   index &= ((1 << iocb_page_shift) - 1);
+   return iocb + index;
+}
+
+static void aio_unmap_range(struct aio_mapped_range *range)
+{
+   int i;
+
+   if (!range->nr_pages)
+   return;
+
+   for (i = 0; i < range->nr_pages; i++)
+   put_page(range->pages[i]);
+
+   kfree(range->pages);
+   range->pages = NULL;
+   range->nr_pages = 0;
+}
+
+static int aio_map_range(struct aio_mapped_range *range, void __user *uaddr,
+size_t size, int gup_flags)
+{
+   int nr_pages, ret;
+
+   if ((unsigned long) uaddr & ~PAGE_MASK)
+   return -EINVAL;
+
+   nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+   range->pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
+   if (!range->pages)
+   return -ENOMEM;
+
+   down_write(&current->mm->mmap_sem);
+   ret = get_user_pages((unsigned long) uaddr, nr_pages, gup_flags,
+   range->pages, NULL);
+   up_write(&current->mm->mmap_sem);
+
+   if (ret < nr_pages) {
+   kfree(range->pages);
+   return -ENOMEM;
+   }
+
+   range->nr_pages = nr_pages;
+   return 0;
+}
+
+static void aio_useriocb_unmap(struct kioctx *ctx)
+{
+   aio_unmap_range(&ctx->iocb_range);
+}
+
+static int aio_useriocb_map(struct kioctx *ctx, struct iocb __user *iocbs)
+{
+   size_t size = sizeof(struct iocb) * ctx->max_reqs;
+
+   return aio_map_range(&ctx->iocb_range, iocbs, size, 0);
+}
+
 SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
iocbs, void __user *, user1, void __user *, user2,
aio_context_t __user *, ctxp)
@@ -1296,7 +1373,9 @@ SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
unsigned long ctx;
long ret;
 
-   if (flags || user1 || user2)
+   if (user1 || user2)
+   return -EINVAL;
+   if (flags & ~IOCTX_FLAG_USERIOCB)
return -EINVAL;
 
ret = get_user(ctx, ctxp);
@@ -1308,9 +1387,17 @@ SYSCALL_DEFINE6(io_setup2, u32, nr_eve

[PATCH 16/26] aio: add submission side request cache

2018-12-04 Thread Jens Axboe
We have to add each submitted polled request to the kioctx
poll_submitted list, which means we have to grab the poll_lock. We
already use the block plug to batch submissions if we're doing a batch
of IO submissions; extend that to cover the poll requests internally as
well.
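
As a rough sketch of how the plug hook can drive the flush, building
on the aio_submit_state shown in the hunks below (the function names
here are illustrative, not necessarily what the full patch uses):

#ifdef CONFIG_BLOCK
static void aio_state_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct aio_submit_state *state;

	state = container_of(cb, struct aio_submit_state, plug_cb);
	if (!list_empty(&state->req_list))
		aio_flush_state_reqs(state->ctx, state);
}
#endif

static void aio_submit_state_start(struct aio_submit_state *state,
				   struct kioctx *ctx)
{
	state->ctx = ctx;
	INIT_LIST_HEAD(&state->req_list);
	state->req_count = 0;

	blk_start_plug(&state->plug);
#ifdef CONFIG_BLOCK
	/* have the block layer call us back if the task schedules */
	state->plug_cb.callback = aio_state_unplug;
	list_add(&state->plug_cb.list, &state->plug.cb_list);
#endif
}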

Signed-off-by: Jens Axboe 
---
 fs/aio.c | 136 +--
 1 file changed, 113 insertions(+), 23 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 5d34317c2929..ae0805dc814c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -240,6 +240,21 @@ struct aio_kiocb {
};
 };
 
+struct aio_submit_state {
+   struct kioctx *ctx;
+
+   struct blk_plug plug;
+#ifdef CONFIG_BLOCK
+   struct blk_plug_cb plug_cb;
+#endif
+
+   /*
+* Polled iocbs that have been submitted, but not added to the ctx yet
+*/
+   struct list_head req_list;
+   unsigned int req_count;
+};
+
 /*-- sysctl variables*/
 static DEFINE_SPINLOCK(aio_nr_lock);
 unsigned long aio_nr;  /* current system wide number of aio requests */
@@ -257,6 +272,15 @@ static const struct address_space_operations aio_ctx_aops;
 static const unsigned int iocb_page_shift =
ilog2(PAGE_SIZE / sizeof(struct iocb));
 
+/*
+ * We rely on block level unplugs to flush pending requests, if we schedule
+ */
+#ifdef CONFIG_BLOCK
+static const bool aio_use_state_req_list = true;
+#else
+static const bool aio_use_state_req_list = false;
+#endif
+
 static void aio_useriocb_unmap(struct kioctx *);
 static void aio_iopoll_reap_events(struct kioctx *);
 
@@ -1887,13 +1911,28 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
}
 }
 
+/*
+ * Called either at the end of IO submission, or through a plug callback
+ * because we're going to schedule. Moves out local batch of requests to
+ * the ctx poll list, so they can be found for polling + reaping.
+ */
+static void aio_flush_state_reqs(struct kioctx *ctx,
+struct aio_submit_state *state)
+{
+   spin_lock(&ctx->poll_lock);
+   list_splice_tail_init(&state->req_list, &ctx->poll_submitted);
+   spin_unlock(&ctx->poll_lock);
+   state->req_count = 0;
+}
+
 /*
  * After the iocb has been issued, it's safe to be found on the poll list.
  * Adding the kiocb to the list AFTER submission ensures that we don't
  * find it from a io_getevents() thread before the issuer is done accessing
  * the kiocb cookie.
  */
-static void aio_iopoll_iocb_issued(struct aio_kiocb *kiocb)
+static void aio_iopoll_iocb_issued(struct aio_submit_state *state,
+  struct aio_kiocb *kiocb)
 {
/*
 * For fast devices, IO may have already completed. If it has, add
@@ -1903,12 +1942,21 @@ static void aio_iopoll_iocb_issued(struct aio_kiocb *kiocb)
	const int front_add = test_bit(IOCB_POLL_COMPLETED, &kiocb->ki_flags);
struct kioctx *ctx = kiocb->ki_ctx;
 
-   spin_lock(&ctx->poll_lock);
-   if (front_add)
-   list_add(&kiocb->ki_list, &ctx->poll_submitted);
-   else
-   list_add_tail(&kiocb->ki_list, &ctx->poll_submitted);
-   spin_unlock(&ctx->poll_lock);
+   if (!state || !aio_use_state_req_list) {
+   spin_lock(&ctx->poll_lock);
+   if (front_add)
+   list_add(&kiocb->ki_list, &ctx->poll_submitted);
+   else
+   list_add_tail(&kiocb->ki_list, &ctx->poll_submitted);
+   spin_unlock(&ctx->poll_lock);
+   } else {
+   if (front_add)
+   list_add(&kiocb->ki_list, &state->req_list);
+   else
+   list_add_tail(&kiocb->ki_list, &state->req_list);
+   if (++state->req_count >= AIO_IOPOLL_BATCH)
+   aio_flush_state_reqs(ctx, state);
+   }
 }
 
 static ssize_t aio_read(struct aio_kiocb *kiocb, const struct iocb *iocb,
@@ -2204,7 +2252,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 }
 
 static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
-  struct iocb __user *user_iocb, bool compat)
+  struct iocb __user *user_iocb,
+  struct aio_submit_state *state, bool compat)
 {
struct aio_kiocb *req;
ssize_t ret;
@@ -2308,7 +2357,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
ret = -EAGAIN;
goto out_put_req;
}
-   aio_iopoll_iocb_issued(req);
+   aio_iopoll_iocb_issued(state, req);
}
return 0;
 out_put_req:
@@ -2322,7 +2371,7 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-bool compat)
+struct aio_submit_state *state, bool compat)
 {

[PATCH 01/26] fs: add an iopoll method to struct file_operations

2018-12-04 Thread Jens Axboe
From: Christoph Hellwig 

This new method is used to explicitly poll for I/O completion for an
iocb.  It must be called for any iocb submitted asynchronously (that
is, with a non-null ki_complete) which has the IOCB_HIPRI flag set.

The method is assisted by a new ki_cookie field in struct iocb to store
the polling cookie.

TODO: we can probably union ki_cookie with the existing hint and I/O
priority fields to avoid struct kiocb growth.
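
For illustration, a block device implementation might look roughly
like the below, assuming the blk_poll() signature the rest of this
series introduces (taking a spin argument); blkdev_iopoll here is a
sketch, not part of this patch:

static int blkdev_iopoll(struct kiocb *kiocb, bool spin)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	/* ki_cookie holds the blk_qc_t stashed at submission time */
	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin);
}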

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
---
 Documentation/filesystems/vfs.txt | 3 +++
 include/linux/fs.h| 2 ++
 2 files changed, 5 insertions(+)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5f71a252e2e0..d9dc5e4d82b9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,6 +857,7 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+   int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
@@ -902,6 +903,8 @@ otherwise noted.
 
   write_iter: possibly asynchronous write with iov_iter as source
 
+  iopoll: called when aio wants to poll for completions on HIPRI iocbs
+
   iterate: called when the VFS needs to read the directory contents
 
   iterate_shared: called when the VFS needs to read the directory contents
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a1ab233e6469..6a5f71f8ae06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -310,6 +310,7 @@ struct kiocb {
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
+   unsigned int    ki_cookie; /* for ->iopoll */
 } __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -1781,6 +1782,7 @@ struct file_operations {
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
+   int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
-- 
2.17.1


