On Fri, Feb 8, 2019 at 6:35 PM Jens Axboe <[email protected]> wrote:
> We normally have to fget/fput for each IO we do on a file. Even with
> the batching we do, the cost of the atomic inc/dec of the file usage
> count adds up.
>
> This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes
> for the io_uring_register(2) system call. The arguments passed in must
> be an array of __s32 holding file descriptors, and nr_args should hold
> the number of file descriptors the application wishes to pin for the
> duration of the io_uring instance (or until IORING_UNREGISTER_FILES is
> called).
>
> When used, the application must set IOSQE_FIXED_FILE in the sqe->flags
> member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd
> to the index in the array passed in to IORING_REGISTER_FILES.
>
> Files are automatically unregistered when the io_uring instance is torn
> down. An application need only unregister if it wishes to register a new
> set of fds.
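
For context, this is how I read the intended userspace flow (assuming a
plain syscall(2) wrapper for io_uring_register(); fd_a, fd_b and ring_fd
are placeholders, error handling omitted):

	__s32 fds[2] = { fd_a, fd_b };	/* two already-open descriptors */

	/* pin both files for the lifetime of the ring */
	io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, 2);

	/* later, when building an sqe: refer to fds[1] by index */
	sqe->flags |= IOSQE_FIXED_FILE;
	sqe->fd = 1;	/* index into the registered array, not an fd */
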
I think the overall concept here is still broken: you're giving the
user_files to the GC, and the GC can drop their refcounts, but I don't
see you getting any feedback from the GC anywhere that would let it
safely break your references. E.g. in io_prep_rw() you grab file
pointers from ctx->user_files after simply checking ctx->nr_user_files,
and there is no path from the GC that touches those fields. As far as I
can tell, the GC is just going to go through unix_destruct_scm() and
drop references on your files, causing a use-after-free.
But the unix GC is complicated, and maybe I'm just missing something...
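
To make that concrete, the fixed-file lookup amounts to something like
this (paraphrased into a standalone helper for illustration;
io_fixed_file_get() is my name, not something from the patch):

	static struct file *io_fixed_file_get(struct io_ring_ctx *ctx, int fd)
	{
		if (!ctx->user_files || (unsigned) fd >= ctx->nr_user_files)
			return NULL;
		/*
		 * No reference is taken and nothing here synchronizes
		 * against the unix GC; if the GC decides these files are
		 * garbage and runs unix_destruct_scm() -> fput() on them,
		 * this returns a pointer to a freed struct file.
		 */
		return ctx->user_files[fd];
	}
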
> +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
> +{
> +#if defined(CONFIG_UNIX)
> + if (ctx->ring_sock) {
> + struct sock *sock = ctx->ring_sock->sk;
> + struct sk_buff *skb;
> +
> + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
> + kfree_skb(skb);
> + }
> +#else
> + int i;
> +
> + for (i = 0; i < ctx->nr_user_files; i++)
> + fput(ctx->user_files[i]);
> +#endif
> +}
> +
> +static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
> +{
> + if (!ctx->user_files)
> + return -ENXIO;
> +
> + __io_sqe_files_unregister(ctx);
> + kfree(ctx->user_files);
> + ctx->user_files = NULL;
> + return 0;
> +}
> +
> +#if defined(CONFIG_UNIX)
> +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
> +{
> + struct scm_fp_list *fpl;
> + struct sk_buff *skb;
> + int i;
> +
> + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
> + if (!fpl)
> + return -ENOMEM;
> +
> + skb = alloc_skb(0, GFP_KERNEL);
> + if (!skb) {
> + kfree(fpl);
> + return -ENOMEM;
> + }
> +
> + skb->sk = ctx->ring_sock->sk;
> + skb->destructor = unix_destruct_scm;
> +
> + fpl->user = get_uid(ctx->user);
> + for (i = 0; i < nr; i++) {
> + fpl->fp[i] = get_file(ctx->user_files[i + offset]);
> + unix_inflight(fpl->user, fpl->fp[i]);
> + fput(fpl->fp[i]);
This pattern is almost always superfluous. You increment the file's
refcount, maybe insert the file into a list (essentially), and drop
the file's refcount back down. You're already holding a stable
reference, and you're not temporarily lending that to anyone else.
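
Since the CONFIG_UNIX unregister path just frees the skbs, ownership of
the ctx->user_files reference effectively moves into the skb anyway, so
you could express the transfer directly; roughly (untested):

	for (i = 0; i < nr; i++) {
		/*
		 * Hand the reference already held in ctx->user_files[]
		 * over to the skb's fp list; get_file() + fput() around
		 * unix_inflight() nets out to exactly this.
		 */
		fpl->fp[i] = ctx->user_files[i + offset];
		unix_inflight(fpl->user, fpl->fp[i]);
	}
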
> + }
> +
> + fpl->max = fpl->count = nr;
> + UNIXCB(skb).fp = fpl;
> + skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
> + return 0;
> +}
> +
> +/*
> + * If UNIX sockets are enabled, fd passing can cause a reference cycle which
> + * causes regular reference counting to break down. We rely on the UNIX
> + * garbage collection to take care of this problem for us.
> + */
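
(For anyone else reading along: the cycle in question can be constructed
entirely from userspace, roughly like this, with the same syscall-wrapper
assumption as above and send_fd() standing in for an SCM_RIGHTS
sendmsg() helper:)

	int sp[2];

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sp);
	/* the ring now holds a reference to sp[0] */
	io_uring_register(ring_fd, IORING_REGISTER_FILES, &sp[0], 1);
	/* sp[0]'s receive queue now holds a reference back to the ring */
	send_fd(sp[0], ring_fd);
	/* neither refcount can hit zero now without the unix GC */
	close(sp[0]); close(sp[1]); close(ring_fd);
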
> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
> +{
> + unsigned left, total;
> + int ret = 0;
> +
> + total = 0;
> + left = ctx->nr_user_files;
> + while (left) {
> + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
> +
> + ret = __io_sqe_files_scm(ctx, this_files, total);
> + if (ret)
> + break;
> + left -= this_files;
> + total += this_files;
> + }
> +
> + return ret;
> +}
> +#else
> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
> +{
> + return 0;
> +}
> +#endif
> +
> +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> + unsigned nr_args)
> +{
> + __s32 __user *fds = (__s32 __user *) arg;
> + int fd, ret = 0;
> + unsigned i;
> +
> + if (ctx->user_files)
> + return -EBUSY;
> + if (!nr_args)
> + return -EINVAL;
> + if (nr_args > IORING_MAX_FIXED_FILES)
> + return -EMFILE;
> +
> + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL);
> + if (!ctx->user_files)
> + return -ENOMEM;
> +
> + for (i = 0; i < nr_args; i++) {
> + ret = -EFAULT;
> + if (copy_from_user(&fd, &fds[i], sizeof(fd)))
> + break;
> +
> + ctx->user_files[i] = fget(fd);
> +
> + ret = -EBADF;
> + if (!ctx->user_files[i])
> + break;
> + /*
> + * Don't allow io_uring instances to be registered. If UNIX
> + * isn't enabled, then this causes a reference cycle and this
> + * instance can never get freed. If UNIX is enabled we'll
> + * handle it just fine, but there's still no point in allowing
> + * a ring fd as it doesn't suppor regular read/write anyway.
nit: support
> + */
> + if (ctx->user_files[i]->f_op == &io_uring_fops) {
> + fput(ctx->user_files[i]);
> + break;
> + }
> + ctx->nr_user_files++;
> + ret = 0;
> + }
> +
> + if (!ret)
> + ret = io_sqe_files_scm(ctx);
> + if (ret)
> + io_sqe_files_unregister(ctx);
> +
> + return ret;
> +}
> +
> static int io_sq_offload_start(struct io_ring_ctx *ctx)
> {
> int ret;
> @@ -1521,14 +1708,16 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
> destroy_workqueue(ctx->sqo_wq);
> if (ctx->sqo_mm)
> mmdrop(ctx->sqo_mm);
> +
> + io_iopoll_reap_events(ctx);
> + io_sqe_buffer_unregister(ctx);
> + io_sqe_files_unregister(ctx);
> +
> #if defined(CONFIG_UNIX)
> if (ctx->ring_sock)
> sock_release(ctx->ring_sock);
> #endif
>
> - io_iopoll_reap_events(ctx);
> - io_sqe_buffer_unregister(ctx);