On Fri, 2018-12-07 at 15:20 -0700, Jens Axboe wrote:
> This is just like io_setup(), except add a flags argument to let the
> caller control/define some of the io_context behavior.
>
> Outside of the flags, we add an iocb array and two user pointers for
> future use.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/aio.c | 69 ++++++++++++++++----------
> include/linux/syscalls.h | 3 ++
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 5 files changed, 52 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl
> b/arch/x86/entry/syscalls/syscall_64.tbl
> index f0b1709a5ffb..67c357225fb0 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -343,6 +343,7 @@
> 332 common statx __x64_sys_statx
> 333 common io_pgetevents __x64_sys_io_pgetevents
> 334 common rseq __x64_sys_rseq
> +335 common io_setup2 __x64_sys_io_setup2
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/aio.c b/fs/aio.c
> index 173f1f79dc8f..26631d6872d2 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -100,6 +100,8 @@ struct kioctx {
>
> unsigned long user_id;
>
> + unsigned int flags;
> +
> struct __percpu kioctx_cpu *cpu;
>
> /*
> @@ -686,10 +688,8 @@ static void aio_nr_sub(unsigned nr)
> spin_unlock(&aio_nr_lock);
> }
>
> -/* ioctx_alloc
> - * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
> - */
> -static struct kioctx *ioctx_alloc(unsigned nr_events)
> +static struct kioctx *io_setup_flags(unsigned long ctxid,
> + unsigned int nr_events, unsigned int flags)
> {
> struct mm_struct *mm = current->mm;
> struct kioctx *ctx;
> @@ -701,6 +701,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> */
> unsigned int max_reqs = nr_events;
>
> + if (unlikely(ctxid || nr_events == 0)) {
> + pr_debug("EINVAL: ctx %lu nr_events %u\n",
> + ctxid, nr_events);
> + return ERR_PTR(-EINVAL);
> + }
> +
> /*
> * We keep track of the number of available ringbuffer slots, to prevent
> * overflow (reqs_available), and we also use percpu counters for this.
> @@ -726,6 +732,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
> if (!ctx)
> return ERR_PTR(-ENOMEM);
>
> + ctx->flags = flags;
> ctx->max_reqs = max_reqs;
>
> spin_lock_init(&ctx->ctx_lock);
> @@ -1281,6 +1288,34 @@ static long read_events(struct kioctx *ctx, long
> min_nr, long nr,
> return ret;
> }
>
How about adding a header comment here, similar to the one above
io_setup below?
Also, would you like to mention io_setup2 in
Documentation/sysctl/fs.txt as well?
> +SYSCALL_DEFINE6(io_setup2, u32, nr_events, u32, flags, struct iocb __user *,
> + iocbs, void __user *, user1, void __user *, user2,
> + aio_context_t __user *, ctxp)
> +{
> + struct kioctx *ioctx;
> + unsigned long ctx;
> + long ret;
> +
> + if (flags || user1 || user2)
> + return -EINVAL;
> +
> + ret = get_user(ctx, ctxp);
> + if (unlikely(ret))
> + goto out;
> +
> + ioctx = io_setup_flags(ctx, nr_events, flags);
> + ret = PTR_ERR(ioctx);
> + if (IS_ERR(ioctx))
> + goto out;
> +
> + ret = put_user(ioctx->user_id, ctxp);
> + if (ret)
> + kill_ioctx(current->mm, ioctx, NULL);
> + percpu_ref_put(&ioctx->users);
> +out:
> + return ret;
> +}
> +
> /* sys_io_setup:
> * Create an aio_context capable of receiving at least nr_events.
> * ctxp must not point to an aio_context that already exists, and
> @@ -1296,7 +1331,7 @@ static long read_events(struct kioctx *ctx, long
> min_nr, long nr,
> */
> SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
> {
> - struct kioctx *ioctx = NULL;
> + struct kioctx *ioctx;
> unsigned long ctx;
> long ret;
>
> @@ -1304,14 +1339,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events,
> aio_context_t __user *, ctxp)
> if (unlikely(ret))
> goto out;
>
> - ret = -EINVAL;
> - if (unlikely(ctx || nr_events == 0)) {
> - pr_debug("EINVAL: ctx %lu nr_events %u\n",
> - ctx, nr_events);
> - goto out;
> - }
> -
> - ioctx = ioctx_alloc(nr_events);
> + ioctx = io_setup_flags(ctx, nr_events, 0);
> ret = PTR_ERR(ioctx);
> if (!IS_ERR(ioctx)) {
> ret = put_user(ioctx->user_id, ctxp);
> @@ -1327,7 +1355,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events,
> aio_context_t __user *, ctxp)
> #ifdef CONFIG_COMPAT
> COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
> {
> - struct kioctx *ioctx = NULL;
> + struct kioctx *ioctx;
> unsigned long ctx;
> long ret;
>
> @@ -1335,23 +1363,14 @@ COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events,
> u32 __user *, ctx32p)
> if (unlikely(ret))
> goto out;
>
> - ret = -EINVAL;
> - if (unlikely(ctx || nr_events == 0)) {
> - pr_debug("EINVAL: ctx %lu nr_events %u\n",
> - ctx, nr_events);
> - goto out;
> - }
> -
> - ioctx = ioctx_alloc(nr_events);
> + ioctx = io_setup_flags(ctx, nr_events, 0);
> ret = PTR_ERR(ioctx);
> if (!IS_ERR(ioctx)) {
> - /* truncating is ok because it's a user address */
> - ret = put_user((u32)ioctx->user_id, ctx32p);
> + ret = put_user(ioctx->user_id, ctx32p);
> if (ret)
> kill_ioctx(current->mm, ioctx, NULL);
> percpu_ref_put(&ioctx->users);
> }
> -
> out:
> return ret;
> }
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 2ac3d13a915b..a20a663d583f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -287,6 +287,9 @@ static inline void addr_limit_user_check(void)
> */
> #ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
> asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
> +asmlinkage long sys_io_setup2(unsigned, unsigned, struct iocb __user *,
> + void __user *, void __user *,
> + aio_context_t __user *);
> asmlinkage long sys_io_destroy(aio_context_t ctx);
> asmlinkage long sys_io_submit(aio_context_t, long,
> struct iocb __user * __user *);
> diff --git a/include/uapi/asm-generic/unistd.h
> b/include/uapi/asm-generic/unistd.h
> index 538546edbfbd..b4527ed373b0 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -738,9 +738,11 @@ __SYSCALL(__NR_statx, sys_statx)
> __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
> #define __NR_rseq 293
> __SYSCALL(__NR_rseq, sys_rseq)
> +#define __NR_io_setup2 294
> +__SYSCALL(__NR_io_setup2, sys_io_setup2)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 294
> +#define __NR_syscalls 295
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index df556175be50..17c8b4393669 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -37,6 +37,7 @@ asmlinkage long sys_ni_syscall(void)
> */
>
> COND_SYSCALL(io_setup);
> +COND_SYSCALL(io_setup2);
> COND_SYSCALL_COMPAT(io_setup);
> COND_SYSCALL(io_destroy);
> COND_SYSCALL(io_submit);