> index 293733f61594..9ef9987b4192 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -29,7 +29,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
> obj-$(CONFIG_TIMERFD) += timerfd.o
> obj-$(CONFIG_EVENTFD) += eventfd.o
> obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> -obj-$(CONFIG_AIO) += aio.o
> +obj-$(CONFIG_AIO) += aio.o io_uring.o
It is probably worth adding a new config symbol for the uring code, as
no code is shared with aio.
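Something like the below, with a matching config IO_URING entry next to
CONFIG_AIO in init/Kconfig (the symbol name is just a suggestion):

obj-$(CONFIG_AIO)		+= aio.o
obj-$(CONFIG_IO_URING)		+= io_uring.o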
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> new file mode 100644
> index 000000000000..ae2b886282bb
> --- /dev/null
> +++ b/fs/io_uring.c
> @@ -0,0 +1,849 @@
> +/*
> + * Shared application/kernel submission and completion ring pairs, for
> + * supporting fast/efficient IO.
> + *
> + * Copyright (C) 2019 Jens Axboe
> + */
Add an SPDX header to all new files, please.
> +struct io_sq_ring {
> + struct io_uring r;
> + u32 ring_mask;
> + u32 ring_entries;
> + u32 dropped;
> + u32 flags;
> + u32 array[0];
> +};
field[0] is a legacy gcc extension; the proper C99+ way is a flexible
array member, field[].
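i.e.:

u32 array[];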
> +
> +struct io_iocb_ring {
> + struct io_sq_ring *ring;
> + unsigned entries;
> + unsigned ring_mask;
> + struct io_uring_iocb *iocbs;
> +};
> +
> +struct io_event_ring {
> + struct io_cq_ring *ring;
> + unsigned entries;
> + unsigned ring_mask;
> +};
Btw, do we really need these structures? It would seem simpler
to just embed them into the containing structure as:
struct io_sq_ring *sq_ring;
unsigned sq_ring_entries;
unsigned sq_ring_mask;
struct io_uring_iocb *sq_ring_iocbs;
struct io_cq_ring *cq_ring;
unsigned cq_ring_entries;
unsigned cq_ring_mask;
> +struct io_ring_ctx {
> + struct percpu_ref refs;
> +
> + unsigned int flags;
> + unsigned int max_reqs;
max_reqs can probably go away in favour of the sq ring ring_entries
field.
> + struct io_iocb_ring sq_ring;
> + struct io_event_ring cq_ring;
> +
> + struct work_struct work;
> +
> + struct {
> + struct mutex uring_lock;
> + } ____cacheline_aligned_in_smp;
> +
> + struct {
> + struct mutex ring_lock;
> + wait_queue_head_t wait;
> + } ____cacheline_aligned_in_smp;
> +
> + struct {
> + spinlock_t completion_lock;
> + } ____cacheline_aligned_in_smp;
> +};
Can you take a deep look at whether we need to keep all of ring_lock,
completion_lock and the later added poll locking? From a quick look
it isn't entirely clear what the locking strategy on the completion
side is. It needs to be documented and can hopefully be simplified.
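Even a short block comment above io_ring_ctx spelling out the rules
would help, along these lines (the contents here are of course just
placeholders until the actual strategy is settled):

/*
 * Locking:
 *  - uring_lock: serializes submitters entering the ring
 *  - completion_lock: protects posting of completion events
 *  - ring_lock: ...?
 */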
> +struct fsync_iocb {
> + struct work_struct work;
> + struct file *file;
> + bool datasync;
> +};
Do we actually need this? Can't we just reuse the later thread
offload for fsync? Maybe just add fsync support once everything else
is done to make that simpler.
> +static const struct file_operations io_scqring_fops;
> +
> +static void io_ring_ctx_free(struct work_struct *work);
> +static void io_ring_ctx_ref_free(struct percpu_ref *ref);
Can you try to avoid needing the forward declarations? (except for the
fops, where we probably need it).
>
> +
> +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> +{
> + struct io_ring_ctx *ctx;
> +
> + ctx = kmem_cache_zalloc(ioctx_cachep, GFP_KERNEL);
> + if (!ctx)
> + return NULL;
Do we really need an explicit slab for the contexts?
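A plain kzalloc would do just fine here, e.g.:

ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
	return NULL;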
> +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx)
Maybe replace the req name with something matching the structure
name? (and more on the structure name later).
> +{
> + struct io_kiocb *req;
> +
> + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
> + if (!req)
> + return NULL;
> +
> + percpu_ref_get(&ctx->refs);
> + req->ki_ctx = ctx;
> + INIT_LIST_HEAD(&req->ki_list);
We never do a list_empty check on ki_list, so there should be no need
to initialize it.
> +static void io_fill_event(struct io_uring_event *ev, struct io_kiocb *kiocb,
> + long res, unsigned flags)
> +{
> + ev->index = kiocb->ki_index;
> + ev->res = res;
> + ev->flags = flags;
> +}
Probably no need for this helper.
> +static void io_complete_scqring(struct io_kiocb *iocb, long res, unsigned flags)
> +{
> + io_cqring_fill_event(iocb, res, flags);
> + io_complete_iocb(iocb->ki_ctx, iocb);
> +}
Probably no need for this helper either.
> + ret = kiocb_set_rw_flags(req, iocb->rw_flags);
> + if (unlikely(ret))
> + goto out_fput;
> +
> + /* no one is going to poll for this I/O */
> + req->ki_flags &= ~IOCB_HIPRI;
Now that we don't have the aio legacy to deal with, should we just
reject IOCB_HIPRI on a non-polled context?
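I.e. instead of clearing the flag, something like (the exact condition
obviously depends on how the polled setup flag ends up looking):

if (req->ki_flags & IOCB_HIPRI) {
	ret = -EINVAL;
	goto out_fput;
}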
> +static int io_setup_rw(int rw, const struct io_uring_iocb *iocb,
> + struct iovec **iovec, struct iov_iter *iter)
> +{
> + void __user *buf = (void __user *)(uintptr_t)iocb->addr;
> + size_t ret;
> +
> + ret = import_single_range(rw, buf, iocb->len, *iovec, iter);
> + *iovec = NULL;
> + return ret;
> +}
Is there any point in supporting non-vectored operations here?
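If not, import_iovec would handle the vectored case, something like
the below (assuming addr then carries the user iovec pointer and len
the segment count):

return import_iovec(rw, (const struct iovec __user *)buf, iocb->len,
			UIO_FASTIOV, iovec, iter);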
> + if (S_ISREG(file_inode(file)->i_mode)) {
> + __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
> + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> + }
Overly long lines.
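E.g. a local inode variable keeps these under 80 columns:

struct inode *inode = file_inode(file);

if (S_ISREG(inode->i_mode)) {
	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE, true);
	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
}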
> +static int __io_submit_one(struct io_ring_ctx *ctx,
> + const struct io_uring_iocb *iocb,
> + unsigned long ki_index)
Maybe call this io_ring_submit_one? Or generally find a nice prefix
for all the functions in this file?
> + f = fdget(fd);
> + if (f.file) {
> + struct io_ring_ctx *ctx;
Please just return early on failure instead of forcing another level
of indentation.
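i.e. something like (-EBADF, or whatever error value fits here):

f = fdget(fd);
if (!f.file)
	return -EBADF;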
> +
> + ctx->sq_ring.iocbs = io_mem_alloc(sizeof(struct io_uring_iocb) *
> + p->sq_entries);
Use array_size().
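i.e.:

ctx->sq_ring.iocbs = io_mem_alloc(array_size(p->sq_entries,
				sizeof(struct io_uring_iocb)));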
> +/*
> + * sys_io_uring_setup:
> + * Sets up an aio uring context, and returns the fd. Applications asks
> + * for a ring size, we return the actual sq/cq ring sizes (among other
> + * things) in the params structure passed in.
> + */
Can we drop this odd aio-style comment format? In fact, the syscall
documentation probably belongs in the man page anyway. Same for the
uring_enter syscall.
> +struct io_uring_iocb {
Should we just call this io_uring_sqe?
> +/*
> + * IO completion data structure
> + */
> +struct io_uring_event {
> + __u64 index; /* what iocb this event came from */
> + __s32 res; /* result code for this event */
> + __u32 flags;
> +};
io_uring_cqe?