Tang Chen <[email protected]> writes:

> AIO ring page migration has been implemented by the following patch:
>
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/fs/aio.c?id=36bc08cc01709b4a9bb563b35aa530241ddc63e3
>
> In this patch, ctx->completion_lock is used to prevent other processes
> from accessing the ring page being migrated.
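>
> For reference, the migration side does roughly the following (a
> simplified sketch of aio_migratepage(), not the exact code):
>
>         spin_lock_irqsave(&ctx->completion_lock, flags);
>         migrate_page_copy(new, old);    /* copy contents into the new page */
>         ctx->ring_pages[idx] = new;     /* switch the ring to the new page */
>         spin_unlock_irqrestore(&ctx->completion_lock, flags);
>
> Any unlocked write to a ring page can therefore race with this
> copy-and-replace.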
>
> But aio_setup_ring(), ioctx_add_table() and aio_read_events_ring()
> write to the ring pages without taking ctx->completion_lock.
>
> As a result, we can hit the following race, for example:
>
>             thread 1                      |              thread 2
>                                           |
> aio_migratepage()                         |
>  |-> take ctx->completion_lock            |
>  |-> migrate_page_copy(new, old)          |
>  |   *NOW*, ctx->ring_pages[idx] == old   |
>                                           |
>                                           |    *NOW*, ctx->ring_pages[idx] == old
>                                           |    aio_read_events_ring()
>                                           |     |-> ring = kmap_atomic(ctx->ring_pages[0])
>                                           |     |-> ring->head = head;
>                                           |     |   *HERE, write to the old ring page*
>                                           |     |-> kunmap_atomic(ring);
>                                           |
>  |-> ctx->ring_pages[idx] = new           |
>  |   *BUT NOW*, the content of            |
>  |    ring_pages[idx] is old.             |
>  |-> release ctx->completion_lock         |
>
> As shown above, thread 2's update lands in the old page and is lost;
> the new ring page never sees it.
>
> The solution is to take ctx->completion_lock in thread 2 as well,
> i.e. in aio_setup_ring(), ioctx_add_table() and aio_read_events_ring(),
> around the writes to the ring pages.
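>
> In short, every write to the ring header follows this pattern (the
> same one the patch below applies in all three places):
>
>         spin_lock_irqsave(&ctx->completion_lock, flags);
>         ring = kmap_atomic(ctx->ring_pages[0]);
>         /* ... update ring->head / ring->id / ring->nr ... */
>         kunmap_atomic(ring);
>         flush_dcache_page(ctx->ring_pages[0]);
>         spin_unlock_irqrestore(&ctx->completion_lock, flags);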

Thanks for the good explanation and for adding comments to the code.
This looks right to me.  The only issue I have with it is that the
changed code paths don't run in interrupt context, so I think you
could just use spin_lock_irq().  That's not a big deal, though.
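
For reference, in those process-context paths the irqsave/irqrestore
pair could be trimmed to something like this (sketch only, same
locking semantics):

	spin_lock_irq(&ctx->completion_lock);
	ring = kmap_atomic(ctx->ring_pages[0]);
	ring->head = head;
	kunmap_atomic(ring);
	flush_dcache_page(ctx->ring_pages[0]);
	spin_unlock_irq(&ctx->completion_lock);

spin_lock_irqsave() is only needed when the caller may already have
interrupts disabled; these paths always enter with interrupts enabled.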

Reviewed-by: Jeff Moyer <[email protected]>


> Reported-by: Yasuaki Ishimatsu <[email protected]>
> Signed-off-by: Tang Chen <[email protected]>
> ---
>  fs/aio.c | 33 +++++++++++++++++++++++++++++++++
>  1 file changed, 33 insertions(+)
>
> diff --git a/fs/aio.c b/fs/aio.c
> index 062a5f6..50c089c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -366,6 +366,7 @@ static int aio_setup_ring(struct kioctx *ctx)
>       int nr_pages;
>       int i;
>       struct file *file;
> +     unsigned long flags;
>  
>       /* Compensate for the ring buffer's head/tail overlap entry */
>       nr_events += 2; /* 1 is required, 2 for good luck */
> @@ -437,6 +438,14 @@ static int aio_setup_ring(struct kioctx *ctx)
>       ctx->user_id = ctx->mmap_base;
>       ctx->nr_events = nr_events; /* trusted copy */
>  
> +     /*
> +      * The aio ring pages are user space pages, so they can be migrated.
> +      * When writing to an aio ring page, we should ensure the page is not
> +      * being migrated. Aio page migration procedure is protected by
> +      * ctx->completion_lock, so we add this lock here.
> +      */
> +     spin_lock_irqsave(&ctx->completion_lock, flags);
> +
>       ring = kmap_atomic(ctx->ring_pages[0]);
>       ring->nr = nr_events;   /* user copy */
>       ring->id = ~0U;
> @@ -448,6 +457,8 @@ static int aio_setup_ring(struct kioctx *ctx)
>       kunmap_atomic(ring);
>       flush_dcache_page(ctx->ring_pages[0]);
>  
> +     spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +
>       return 0;
>  }
>  
> @@ -542,6 +553,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
>       unsigned i, new_nr;
>       struct kioctx_table *table, *old;
>       struct aio_ring *ring;
> +     unsigned long flags;
>  
>       spin_lock(&mm->ioctx_lock);
>       rcu_read_lock();
> @@ -556,9 +568,19 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
>                                       rcu_read_unlock();
>                                       spin_unlock(&mm->ioctx_lock);
>  
> +                                     /*
> +                                      * Accessing ring pages must be done
> +                                      * holding ctx->completion_lock to
> +                                      * prevent aio ring page migration
> +                                      * procedure from migrating ring pages.
> +                                      */
> +                                     spin_lock_irqsave(&ctx->completion_lock,
> +                                                       flags);
>                                       ring = kmap_atomic(ctx->ring_pages[0]);
>                                       ring->id = ctx->id;
>                                       kunmap_atomic(ring);
> +                                     spin_unlock_irqrestore(
> +                                             &ctx->completion_lock, flags);
>                                       return 0;
>                               }
>  
> @@ -1021,6 +1043,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
>       unsigned head, tail, pos;
>       long ret = 0;
>       int copy_ret;
> +     unsigned long flags;
>  
>       mutex_lock(&ctx->ring_lock);
>  
> @@ -1066,11 +1089,21 @@ static long aio_read_events_ring(struct kioctx *ctx,
>               head %= ctx->nr_events;
>       }
>  
> +     /*
> +      * The aio ring pages are user space pages, so they can be migrated.
> +      * When writing to an aio ring page, we should ensure the page is not
> +      * being migrated. Aio page migration procedure is protected by
> +      * ctx->completion_lock, so we add this lock here.
> +      */
> +     spin_lock_irqsave(&ctx->completion_lock, flags);
> +
>       ring = kmap_atomic(ctx->ring_pages[0]);
>       ring->head = head;
>       kunmap_atomic(ring);
>       flush_dcache_page(ctx->ring_pages[0]);
>  
> +     spin_unlock_irqrestore(&ctx->completion_lock, flags);
> +
>       pr_debug("%li  h%u t%u\n", ret, head, tail);
>  
>       put_reqs_available(ctx, ret);