On Fri, Jun 09, 2017 at 12:49:34PM +0300, Kirill Tkhai wrote:
> While implementing aio support for the Checkpoint/Restore
> In Userspace project (CRIU), we found that the current
> kernel functionality does not allow waiting for the
> completion of all submitted requests. Checkpoint software
> cannot determine whether there are no in-flight requests,
> i.e. whether the process's aio ring memory is ready for dumping.

Simply put: no.  There are various conditions under which this command
will never complete - if another thread submits i/o, if the in-flight i/o
is waiting on non-disk i/o, etc.  If you want to wait until all aios of a
task are complete, kill the process and wait for the zombie to be reaped.
This is a simplistic approach to checkpointing that does not solve all
the issues involved in checkpointing aio.

                -ben

> This patch solves the problem. The interface it introduces
> may also be useful to other userspace users. It adds a new
> IOCB_CMD_WAIT_ALL command, whose idea is to make the caller
> wait until the sum of completed and available requests
> becomes equal to (ctx->nr_events - 1). When they are equal,
> there are no aio requests in flight.
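
For illustration, here is a minimal sketch of how a caller might issue
the proposed command through the raw io_submit() syscall. It is not code
from the patch; it assumes a kernel with this change applied, and the
opcode value is taken from the aio_abi.h hunk below.

/*
 * Usage sketch only (not from the patch): wait until no requests of
 * this aio context are in flight, using the proposed IOCB_CMD_WAIT_ALL.
 * Error handling is trimmed for brevity.
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

#ifndef IOCB_CMD_WAIT_ALL
#define IOCB_CMD_WAIT_ALL 9     /* value proposed by the patch */
#endif

static long sys_io_setup(unsigned nr, aio_context_t *ctxp)
{
        return syscall(__NR_io_setup, nr, ctxp);
}

static long sys_io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{
        return syscall(__NR_io_submit, ctx, nr, iocbpp);
}

int main(void)
{
        aio_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };

        if (sys_io_setup(128, &ctx) < 0)
                return 1;

        /* ... submit ordinary IOCB_CMD_PREAD/PWRITE requests here ... */

        memset(&cb, 0, sizeof(cb));
        cb.aio_lio_opcode = IOCB_CMD_WAIT_ALL;

        /* Blocks until the context has no in-flight requests. */
        if (sys_io_submit(ctx, 1, cbs) < 0)
                perror("io_submit(IOCB_CMD_WAIT_ALL)");

        return 0;
}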
> 
> Since available requests are accounted per-cpu, we need to
> synchronize with their other readers and writers. The variable
> ctx->dead is used for that; we set it to 1 during synchronization.
> Now let's look at how we make sure that concurrent users on
> other cpus see it.
> 
> There are two cases. When the task is the only user of its
> memory, it races with aio_complete() only. That function takes
> ctx->completion_lock, so we simply take the same lock while
> reading the per-cpu counters.
> 
> When there are more users of current->mm, we rely on the fact
> that put_reqs_available() and get_reqs_available() already run
> with interrupts disabled. The local_irq_disable() and
> local_irq_enable() primitives act as rcu_read_lock_sched() and
> rcu_read_unlock_sched() boundaries, and the waiter uses
> synchronize_sched() to propagate the visibility of ctx->dead
> to the other cpus. This RCU primitive acts as a full memory
> barrier, so nobody will use the per-cpu reqs_available after it
> returns, and the rest of the waiting reduces to the first case.
> A good point is that the solution does not affect the performance
> of the existing code in any way, as it does not change it.
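
Condensed, the interplay described above looks roughly like the
kernel-side sketch below. It is a simplified illustration of the
pattern, not the patch itself (see the hunks that follow for the real
code); names are shortened, and the single-mm-user variant, which takes
ctx->completion_lock instead of calling synchronize_sched(), is only
noted in a comment.

/* Reader side (get_reqs_available()/put_reqs_available() in the patch):
 * the irq-off region doubles as an RCU-sched read-side critical section. */
static bool reader_take_one(struct kioctx *ctx)
{
        struct kioctx_cpu *kcpu;
        unsigned long flags;
        bool ret = false;

        local_irq_save(flags);                  /* rcu_read_lock_sched() */
        if (atomic_read(&ctx->dead))            /* waiter owns the counters now */
                goto out;
        kcpu = this_cpu_ptr(ctx->cpu);
        if (kcpu->reqs_available) {
                kcpu->reqs_available--;
                ret = true;
        }
out:
        local_irq_restore(flags);               /* rcu_read_unlock_sched() */
        return ret;
}

/* Writer side (aio_wait_all() in the patch): once synchronize_sched()
 * returns, every cpu has left the irq-off region above and sees ctx->dead,
 * so the per-cpu counters can be folded into the atomic one without racing
 * the readers. (With a single mm user the patch takes ctx->completion_lock
 * here instead of calling synchronize_sched().) */
static void writer_drain(struct kioctx *ctx)
{
        unsigned reqs = 0;
        int cpu;

        atomic_set(&ctx->dead, 1);
        synchronize_sched();
        for_each_possible_cpu(cpu) {
                struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);

                reqs += kcpu->reqs_available;
                kcpu->reqs_available = 0;
        }
        atomic_add(reqs, &ctx->reqs_available);
}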
> 
> A couple more words about checkpoint/restore. We need exactly
> this functionality, and we are not going to dump in-flight
> requests for now, because experiments show that if dumping a
> container fails for reasons in another subsystem, it is not
> always possible to queue the cancelled requests back (file
> descriptors may already be closed, there may be memory
> pressure). So there is nothing here about dumping.
> 
> It seems this functionality will be useful not only to us,
> and others may use the new command like the other standard
> aio commands.
> 
> Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com>
> ---
>  fs/aio.c                     |   71 +++++++++++++++++++++++++++++++++++++++---
>  include/uapi/linux/aio_abi.h |    1 +
>  2 files changed, 67 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index f52d925ee259..447fa4283c7c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -899,7 +899,11 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
>       struct kioctx_cpu *kcpu;
>       unsigned long flags;
>  
> -     local_irq_save(flags);
> +     local_irq_save(flags); /* Implies rcu_read_lock_sched() */
> +     if (atomic_read(&ctx->dead)) {
> +             atomic_add(nr, &ctx->reqs_available);
> +             goto unlock;
> +     }
>       kcpu = this_cpu_ptr(ctx->cpu);
>       kcpu->reqs_available += nr;
>  
> @@ -907,8 +911,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
>               kcpu->reqs_available -= ctx->req_batch;
>               atomic_add(ctx->req_batch, &ctx->reqs_available);
>       }
> -
> -     local_irq_restore(flags);
> +unlock:
> +     local_irq_restore(flags); /* Implies rcu_read_unlock_sched() */
>  }
>  
>  static bool get_reqs_available(struct kioctx *ctx)
> @@ -917,7 +921,9 @@ static bool get_reqs_available(struct kioctx *ctx)
>       bool ret = false;
>       unsigned long flags;
>  
> -     local_irq_save(flags);
> +     local_irq_save(flags); /* Implies rcu_read_lock_sched() */
> +     if (atomic_read(&ctx->dead))
> +             goto out;
>       kcpu = this_cpu_ptr(ctx->cpu);
>       if (!kcpu->reqs_available) {
>               int old, avail = atomic_read(&ctx->reqs_available);
> @@ -937,7 +943,7 @@ static bool get_reqs_available(struct kioctx *ctx)
>       ret = true;
>       kcpu->reqs_available--;
>  out:
> -     local_irq_restore(flags);
> +     local_irq_restore(flags); /* Implies rcu_read_unlock_sched() */
>       return ret;
>  }
>  
> @@ -1533,6 +1539,58 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
>       return ret;
>  }
>  
> +static bool reqs_completed(struct kioctx *ctx)
> +{
> +     unsigned available;
> +     spin_lock_irq(&ctx->completion_lock);
> +     available = atomic_read(&ctx->reqs_available) + ctx->completed_events;
> +     spin_unlock_irq(&ctx->completion_lock);
> +
> +     return (available == ctx->nr_events - 1);
> +}
> +
> +static int aio_wait_all(struct kioctx *ctx)
> +{
> +     unsigned users, reqs = 0;
> +     struct kioctx_cpu *kcpu;
> +     int cpu, ret;
> +
> +     if (atomic_xchg(&ctx->dead, 1))
> +             return -EBUSY;
> +
> +     users = atomic_read(&current->mm->mm_users);
> +     if (users > 1) {
> +             /*
> +              * Wait till concurrent threads and aio_complete() see
> +              * dead flag. Implies full memory barrier on all cpus.
> +              */
> +             synchronize_sched();
> +     } else {
> +             /*
> +              * Sync with aio_complete() to be sure it puts reqs_available,
> +              * when dead flag is already seen.
> +              */
> +             spin_lock_irq(&ctx->completion_lock);
> +     }
> +
> +     for_each_possible_cpu(cpu) {
> +             kcpu = per_cpu_ptr(ctx->cpu, cpu);
> +             reqs += kcpu->reqs_available;
> +             kcpu->reqs_available = 0;
> +     }
> +
> +     if (users == 1)
> +             spin_unlock_irq(&ctx->completion_lock);
> +
> +     atomic_add(reqs, &ctx->reqs_available);
> +
> +     ret = wait_event_interruptible(ctx->wait, reqs_completed(ctx));
> +
> +     atomic_set(&ctx->dead, 0);
> +
> +     return ret;
> +}
> +
>  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
>                        struct iocb *iocb, bool compat)
>  {
> @@ -1556,6 +1614,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
>               return -EINVAL;
>       }
>  
> +     if (iocb->aio_lio_opcode == IOCB_CMD_WAIT_ALL)
> +             return aio_wait_all(ctx);
> +
>       req = aio_get_req(ctx);
>       if (unlikely(!req))
>               return -EAGAIN;
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index bb2554f7fbd1..29291946d4f1 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -44,6 +44,7 @@ enum {
>       IOCB_CMD_NOOP = 6,
>       IOCB_CMD_PREADV = 7,
>       IOCB_CMD_PWRITEV = 8,
> +     IOCB_CMD_WAIT_ALL = 9,
>  };
>  
>  /*
> 

-- 
"Thought is the essence of where you are now."
