On 1/29/19 4:08 PM, Jann Horn wrote:
> On Wed, Jan 30, 2019 at 12:06 AM Jens Axboe <[email protected]> wrote:
>> On 1/29/19 4:03 PM, Jann Horn wrote:
>>> On Tue, Jan 29, 2019 at 11:56 PM Jens Axboe <[email protected]> wrote:
>>>> On 1/29/19 3:44 PM, Jann Horn wrote:
>>>>> On Tue, Jan 29, 2019 at 8:27 PM Jens Axboe <[email protected]> wrote:
>>>>>> If we have fixed user buffers, we can map them into the kernel when we
>>>>>> setup the io_context. That avoids the need to do get_user_pages() for
>>>>>> each and every IO.
>>>>>>
>>>>>> To utilize this feature, the application must call io_uring_register()
>>>>>> after having setup an io_uring context, passing in
>>>>>> IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer
>>>>>> to an iovec array, and the nr_args should contain how many iovecs the
>>>>>> application wishes to map.
>>>>>>
>>>>>> If successful, these buffers are now mapped into the kernel, eligible
>>>>>> for IO. To use these fixed buffers, the application must use the
>>>>>> IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
>>>>>> set sqe->index to the desired buffer index. sqe->addr..sqe->addr+sqe->len
>>>>>> must point to somewhere inside the indexed buffer.
>>>>>>
>>>>>> The application may register buffers throughout the lifetime of the
>>>>>> io_uring context. It can call io_uring_register() with
>>>>>> IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
>>>>>> buffers, and then register a new set. The application need not
>>>>>> unregister buffers explicitly before shutting down the io_uring context.
>>> [...]
>>>>>> +       imu = &ctx->user_bufs[index];
>>>>>> +       buf_addr = READ_ONCE(sqe->addr);
>>>>>> +       if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + 
>>>>>> imu->len)
>>>>>
>>>>> This can wrap around if `buf_addr` or `len` is very big, right? Then
>>>>> you e.g. get past the first check because `buf_addr` is sufficiently
>>>>> big, and get past the second check because `buf_addr + len` wraps
>>>>> around and becomes small.
>>>>
>>>> Good point. I wonder if we have a verification helper for something like
>>>> this?
>>>
>>> check_add_overflow() exists, I guess that might help a bit. I don't
>>> think I've seen a more specific helper for this situation.
>>
>> Hmm, not super appropriate. How about something ala:
>>
>> if (buf_addr + len < buf_addr)
>>     ... overflow ...
>>
>> ?
> 
> Sure, sounds good.

Just folded in this incremental, which should fix all the issues outlined
in your email.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7364feebafed..d42541357969 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -751,7 +751,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 {
        size_t len = READ_ONCE(sqe->len);
        struct io_mapped_ubuf *imu;
-       int buf_index, index;
+       unsigned index, buf_index;
        size_t offset;
        u64 buf_addr;
 
@@ -763,9 +763,12 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
        if (unlikely(buf_index >= ctx->nr_user_bufs))
                return -EFAULT;
 
-       index = array_index_nospec(buf_index, ctx->sq_entries);
+       index = array_index_nospec(buf_index, ctx->nr_user_bufs);
        imu = &ctx->user_bufs[index];
        buf_addr = READ_ONCE(sqe->addr);
+
+       if (buf_addr + len < buf_addr)
+               return -EFAULT;
        if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
                return -EFAULT;
 
@@ -1602,6 +1605,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 
 static int io_sq_thread(void *data)
 {
+       struct io_uring_sqe lsqe[IO_IOPOLL_BATCH];
        struct sqe_submit sqes[IO_IOPOLL_BATCH];
        struct io_ring_ctx *ctx = data;
        struct mm_struct *cur_mm = NULL;
@@ -1701,6 +1705,14 @@ static int io_sq_thread(void *data)
                i = 0;
                all_fixed = true;
                do {
+                       /*
+                        * Ensure sqe is stable between checking if we need
+                        * user access, and actually importing the iovec
+                        * further down the stack.
+                        */
+                       memcpy(&lsqe[i], sqes[i].sqe, sizeof(lsqe[i]));
+                       sqes[i].sqe = &lsqe[i];
+
                        if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
                                all_fixed = false;
 
@@ -2081,7 +2093,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
        struct iovec __user *src;
 
 #ifdef CONFIG_COMPAT
-       if (in_compat_syscall()) {
+       if (ctx->compat) {
                struct compat_iovec __user *ciovs;
                struct compat_iovec ciov;
 
@@ -2103,7 +2115,6 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                                  unsigned nr_args)
 {
-       struct vm_area_struct **vmas = NULL;
        struct page **pages = NULL;
        int i, j, got_pages = 0;
        int ret = -EINVAL;
@@ -2138,7 +2149,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                 * submitted if they are wrong.
                 */
                ret = -EFAULT;
-               if (!iov.iov_base)
+               if (!iov.iov_base || !iov.iov_len)
                        goto err;
 
                /* arbitrary limit, but we need something */
@@ -2155,14 +2166,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                        goto err;
 
                if (!pages || nr_pages > got_pages) {
-                       kfree(vmas);
                        kfree(pages);
                        pages = kmalloc_array(nr_pages, sizeof(struct page *),
                                                GFP_KERNEL);
-                       vmas = kmalloc_array(nr_pages,
-                                       sizeof(struct vma_area_struct *),
-                                       GFP_KERNEL);
-                       if (!pages || !vmas) {
+                       if (!pages) {
                                io_unaccount_mem(ctx, nr_pages);
                                goto err;
                        }
@@ -2176,32 +2183,18 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                        goto err;
                }
 
-               down_write(&current->mm->mmap_sem);
-               pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
-                                               pages, vmas);
-               if (pret == nr_pages) {
-                       /* don't support file backed memory */
-                       for (j = 0; j < nr_pages; j++) {
-                               struct vm_area_struct *vma = vmas[j];
+               down_read(&current->mm->mmap_sem);
+               pret = get_user_pages_longterm(ubuf, nr_pages,
+                                               FOLL_WRITE | FOLL_ANON, pages,
+                                               NULL);
+               up_read(&current->mm->mmap_sem);
 
-                               if (vma->vm_file) {
-                                       ret = -EOPNOTSUPP;
-                                       break;
-                               }
-                       }
-               } else {
-                       ret = pret < 0 ? pret : -EFAULT;
-               }
-               up_write(&current->mm->mmap_sem);
-               if (ret) {
-                       /*
-                        * if we did partial map, or found file backed vmas,
-                        * release any pages we did get
-                        */
+               if (pret != nr_pages) {
                        if (pret > 0) {
                                for (j = 0; j < pret; j++)
                                        put_page(pages[j]);
                        }
+                       ret = pret < 0 ? pret : -EFAULT;
                        io_unaccount_mem(ctx, nr_pages);
                        goto err;
                }
@@ -2224,12 +2217,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
                imu->nr_bvecs = nr_pages;
        }
        kfree(pages);
-       kfree(vmas);
        ctx->nr_user_bufs = nr_args;
        return 0;
 err:
        kfree(pages);
-       kfree(vmas);
        io_sqe_buffer_unregister(ctx);
        return ret;
 }

-- 
Jens Axboe

Reply via email to