Am 18.03.2026 um 16:32 hat Hanna Czenczek geschrieben:
> Short reads/writes can happen.  One way to reproduce them is via our
> FUSE export, with the following diff applied (%s/escaped // to apply --
> if you put plain diffs in commit messages, git-am will apply them, and I
> would rather avoid breaking FUSE accidentally via this patch):
> 
> escaped diff --git a/block/export/fuse.c b/block/export/fuse.c
> escaped index a2a478d293..67dc50a412 100644
> escaped --- a/block/export/fuse.c
> escaped +++ b/block/export/fuse.c
> @@ -828,7 +828,7 @@ static ssize_t coroutine_fn GRAPH_RDLOCK
>  fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>               const struct fuse_init_in_compat *in)
>  {
> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> +    const uint32_t supported_flags = FUSE_ASYNC_READ;
> 
>      if (in->major != 7) {
>          error_report("FUSE major version mismatch: We have 7, but kernel has 
> %"
> @@ -1060,6 +1060,8 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t 
> offset, uint32_t size)
>      void *buf;
>      int ret;
> 
> +    size = MIN(size, 4096);
> +
>      /* Limited by max_read, should not happen */
>      if (size > FUSE_MAX_READ_BYTES) {
>          return -EINVAL;
> @@ -1110,6 +1112,8 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out 
> *out,
>      int64_t blk_len;
>      int ret;
> 
> +    size = MIN(size, 4096);
> +
>      QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
>      /* Limited by max_write, should not happen */
>      if (size > FUSE_MAX_WRITE_BYTES) {
> 
> Then:
> $ ./qemu-img create -f raw test.raw 8k
> Formatting 'test.raw', fmt=raw size=8192
> $ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw
> wrote 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec)
> $ hexdump -C test.raw
> 00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
> *
> 00002000
> 
> With aio=threads, short I/O works:
> $ storage-daemon/qemu-storage-daemon \
>     --blockdev file,node-name=test,filename=test.raw \
>     --export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true
> 
> Other shell:
> $ ./qemu-io --image-opts -c 'read -P 42 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=threads
> read 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (36.563 MiB/sec and 4680.0923 ops/sec)
> $ ./qemu-io --image-opts -c 'write -P 23 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=threads
> wrote 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (35.995 MiB/sec and 4607.2970 ops/sec)
> $ hexdump -C test.raw
> 00000000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
> *
> 00002000
> 
> But with aio=native, it does not:
> $ ./qemu-io --image-opts -c 'read -P 23 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=native
> Pattern verification failed at offset 0, 8192 bytes
> read 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (86.155 MiB/sec and 11027.7900 ops/sec)
> $ ./qemu-io --image-opts -c 'write -P 42 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=native
> write failed: No space left on device
> $ hexdump -C test.raw
> 00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
> *
> 00001000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
> *
> 00002000
> 
> This patch fixes that.
> 
> Signed-off-by: Hanna Czenczek <[email protected]>
> ---
>  block/linux-aio.c | 61 ++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 53 insertions(+), 8 deletions(-)
> 
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 1f25339dc9..01621d4794 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -46,6 +46,10 @@ struct qemu_laiocb {
>      size_t nbytes;
>      QEMUIOVector *qiov;
>  
> +    /* For handling short reads/writes */
> +    size_t total_done;
> +    QEMUIOVector resubmit_qiov;
> +
>      int type;
>      BdrvRequestFlags flags;
>      uint64_t dev_max_batch;
> @@ -73,28 +77,61 @@ struct LinuxAioState {
>  };
>  
>  static void ioq_submit(LinuxAioState *s);
> +static int laio_do_submit(struct qemu_laiocb *laiocb);
>  
>  static inline ssize_t io_event_ret(struct io_event *ev)
>  {
>      return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
>  }
>  
> +/**
> + * Retry tail of short requests.
> + */
> +static int laio_resubmit_short_io(struct qemu_laiocb *laiocb, size_t done)
> +{
> +    QEMUIOVector *resubmit_qiov = &laiocb->resubmit_qiov;
> +
> +    laiocb->total_done += done;
> +
> +    if (!resubmit_qiov->iov) {
> +        qemu_iovec_init(resubmit_qiov, laiocb->qiov->niov);
> +    } else {
> +        qemu_iovec_reset(resubmit_qiov);
> +    }
> +    qemu_iovec_concat(resubmit_qiov, laiocb->qiov,
> +                      laiocb->total_done, laiocb->nbytes - 
> laiocb->total_done);
> +
> +    return laio_do_submit(laiocb);
> +}
> +
>  /*
>   * Completes an AIO request.
>   */
>  static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
>  {
> -    int ret;
> +    ssize_t ret;
>  
>      ret = laiocb->ret;
>      if (ret != -ECANCELED) {
> -        if (ret == laiocb->nbytes) {
> +        if (ret == laiocb->nbytes - laiocb->total_done) {
>              ret = 0;
> +        } else if (ret > 0 && (laiocb->type == QEMU_AIO_READ ||
> +                               laiocb->type == QEMU_AIO_WRITE)) {
> +            ret = laio_resubmit_short_io(laiocb, ret);
> +            if (!ret) {
> +                return;
> +            }
>          } else if (ret >= 0) {
> -            /* Short reads mean EOF, pad with zeros. */
> +            /*
> +             * For normal reads and writes, we only get here if ret == 0, 
> which
> +             * means EOF for reads and ENOSPC for writes.
> +             * For zone-append, we get here with any ret >= 0, which we just
> +             * treat as ENOSPC, too (safer than resubmitting, probably, but 
> not
> +             * 100 % clear).
> +             */
>              if (laiocb->type == QEMU_AIO_READ) {
> -                qemu_iovec_memset(laiocb->qiov, ret, 0,
> -                    laiocb->qiov->size - ret);
> +                qemu_iovec_memset(laiocb->qiov, laiocb->total_done, 0,
> +                                  laiocb->qiov->size - laiocb->total_done);
>              } else {
>                  ret = -ENOSPC;
>              }
> @@ -102,6 +139,7 @@ static void qemu_laio_process_completion(struct 
> qemu_laiocb *laiocb)
>      }
>  
>      laiocb->ret = ret;
> +    qemu_iovec_destroy(&laiocb->resubmit_qiov);

Calling qemu_iovec_destroy() for a qiov that has potentially never been
initialised feels a bit unsafe to me. It will work in practice for the
current implementation, but maybe make this one conditional on
laiocb->resubmit_qiov.iov, too? (Which is already making assumptions
about the internals of QEMUIOVector, but that we'll have an iov after
initialising the qiov with qemu_iovec_init() above will probably never
change.)

>      /*
>       * If the coroutine is already entered it must be in ioq_submit() and
> @@ -380,23 +418,30 @@ static int laio_do_submit(struct qemu_laiocb *laiocb)
>      int fd = laiocb->fd;
>      off_t offset = laiocb->offset;

I wonder if making it laiocb->offset + laiocb->total_done here wouldn't
be more robust than having the addition in every call below.

> +    if (laiocb->resubmit_qiov.iov) {
> +        qiov = &laiocb->resubmit_qiov;
> +    }
> +
>      switch (laiocb->type) {
>      case QEMU_AIO_WRITE:
>  #ifdef HAVE_IO_PREP_PWRITEV2
>      {
>          int laio_flags = (laiocb->flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
> -        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, 
> laio_flags);
> +        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov,
> +                         offset + laiocb->total_done, laio_flags);
>      }
>  #else
>          assert(laiocb->flags == 0);
> -        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
> +        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov,
> +                        offset + laiocb->total_done);
>  #endif
>          break;
>      case QEMU_AIO_ZONE_APPEND:
>          io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
>          break;
>      case QEMU_AIO_READ:
> -        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
> +        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov,
> +                       offset + laiocb->total_done);
>          break;
>      case QEMU_AIO_FLUSH:
>          io_prep_fdsync(iocbs, fd);

Reviewed-by: Kevin Wolf <[email protected]>


Reply via email to