On Mon, Jan 28, 2019 at 10:36 PM Jens Axboe <[email protected]> wrote:
> If we have fixed user buffers, we can map them into the kernel when we
> setup the io_context. That avoids the need to do get_user_pages() for
> each and every IO.
[...]
> +static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> + void __user *arg, unsigned nr_args)
> +{
> + int ret;
> +
> + /* Drop our initial ref and wait for the ctx to be fully idle */
> + percpu_ref_put(&ctx->refs);
The line above drops a reference that you just got in the caller...
> + percpu_ref_kill(&ctx->refs);
> + wait_for_completion(&ctx->ctx_done);
> +
> + switch (opcode) {
> + case IORING_REGISTER_BUFFERS:
> + ret = io_sqe_buffer_register(ctx, arg, nr_args);
> + break;
> + case IORING_UNREGISTER_BUFFERS:
> + ret = -EINVAL;
> + if (arg || nr_args)
> + break;
> + ret = io_sqe_buffer_unregister(ctx);
> + break;
> + default:
> + ret = -EINVAL;
> + break;
> + }
> +
> + /* bring the ctx back to life */
> + reinit_completion(&ctx->ctx_done);
> + percpu_ref_resurrect(&ctx->refs);
> + percpu_ref_get(&ctx->refs);
And then this line takes a reference that the caller will immediately
drop again? Why?
> + return ret;
> +}
> +
> +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
> + void __user *, arg, unsigned int, nr_args)
> +{
> + struct io_ring_ctx *ctx;
> + long ret = -EBADF;
> + struct fd f;
> +
> + f = fdget(fd);
> + if (!f.file)
> + return -EBADF;
> +
> + ret = -EOPNOTSUPP;
> + if (f.file->f_op != &io_uring_fops)
> + goto out_fput;
> +
> + ret = -ENXIO;
> + ctx = f.file->private_data;
> + if (!percpu_ref_tryget(&ctx->refs))
> + goto out_fput;
If you are holding the uring_lock of a ctx that can be accessed
through a file descriptor (which you do just after this point), you
know that the percpu_ref isn't zero, right? Why are you doing the
tryget here?
> + ret = -EBUSY;
> + if (mutex_trylock(&ctx->uring_lock)) {
> + ret = __io_uring_register(ctx, opcode, arg, nr_args);
> + mutex_unlock(&ctx->uring_lock);
> + }
> + io_ring_drop_ctx_refs(ctx, 1);
> +out_fput:
> + fdput(f);
> + return ret;
> +}