AioContext has its own io_uring instance for file descriptor monitoring. The disk I/O io_uring code was developed separately. Originally I thought the characteristics of file descriptor monitoring and disk I/O were too different for them to share a ring, and that they therefore required separate io_uring instances.
Now it has become clear to me that it's feasible to share a single io_uring instance for file descriptor monitoring and disk I/O. We're not using io_uring's IOPOLL feature or anything else that would require a separate instance. Unify block/io_uring.c and util/fdmon-io_uring.c using the new aio_add_sqe() API that allows user-defined io_uring sqe submission. Now block/io_uring.c just needs to submit readv/writev/fsync and most of the io_uring-specific logic is handled by fdmon-io_uring.c. There are two immediate advantages: 1. Fewer system calls. There is no need to monitor the disk I/O io_uring ring fd from the file descriptor monitoring io_uring instance. Disk I/O completions are now picked up directly. Also, sqes are accumulated in the sq ring until the end of the event loop iteration and there are fewer io_uring_enter(2) syscalls. 2. Less code duplication. Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com> --- include/block/aio.h | 7 - include/block/raw-aio.h | 5 - block/file-posix.c | 38 ++-- block/io_uring.c | 489 ++++++++++------------------------------ stubs/io_uring.c | 32 --- util/async.c | 35 --- util/fdmon-io_uring.c | 6 + block/trace-events | 12 +- stubs/meson.build | 3 - util/trace-events | 4 + 10 files changed, 139 insertions(+), 492 deletions(-) delete mode 100644 stubs/io_uring.c diff --git a/include/block/aio.h b/include/block/aio.h index 95beef28c3..fbb45cca74 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -291,8 +291,6 @@ struct AioContext { struct LinuxAioState *linux_aio; #endif #ifdef CONFIG_LINUX_IO_URING - LuringState *linux_io_uring; - /* State for file descriptor monitoring using Linux io_uring */ struct io_uring fdmon_io_uring; AioHandlerSList submit_list; @@ -597,11 +595,6 @@ struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); /* Return the LinuxAioState bound to this AioContext */ struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); -/* Setup the LuringState bound to this AioContext */ 
-LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp); - -/* Return the LuringState bound to this AioContext */ -LuringState *aio_get_linux_io_uring(AioContext *ctx); /** * aio_timer_new_with_attrs: * @ctx: the aio context diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h index 6570244496..30e5fc9a9f 100644 --- a/include/block/raw-aio.h +++ b/include/block/raw-aio.h @@ -74,15 +74,10 @@ static inline bool laio_has_fua(void) #endif /* io_uring.c - Linux io_uring implementation */ #ifdef CONFIG_LINUX_IO_URING -LuringState *luring_init(Error **errp); -void luring_cleanup(LuringState *s); - /* luring_co_submit: submit I/O requests in the thread's current AioContext. */ int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, QEMUIOVector *qiov, int type, BdrvRequestFlags flags); -void luring_detach_aio_context(LuringState *s, AioContext *old_context); -void luring_attach_aio_context(LuringState *s, AioContext *new_context); bool luring_has_fua(void); #else static inline bool luring_has_fua(void) diff --git a/block/file-posix.c b/block/file-posix.c index 9b5f08ccb2..d1f1fc3a77 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -755,14 +755,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, } #endif /* !defined(CONFIG_LINUX_AIO) */ -#ifndef CONFIG_LINUX_IO_URING if (s->use_linux_io_uring) { +#ifdef CONFIG_LINUX_IO_URING + if (!aio_has_io_uring()) { + error_setg(errp, "aio=io_uring was specified, but is not " + "available (disabled via io_uring_disabled " + "sysctl or blocked by container runtime " + "seccomp policy?)"); + ret = -EINVAL; + goto fail; + } +#else error_setg(errp, "aio=io_uring was specified, but is not supported " "in this build."); ret = -EINVAL; goto fail; - } #endif /* !defined(CONFIG_LINUX_IO_URING) */ + } s->has_discard = true; s->has_write_zeroes = true; @@ -2522,27 +2531,6 @@ static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } 
-#ifdef CONFIG_LINUX_IO_URING -static inline bool raw_check_linux_io_uring(BDRVRawState *s) -{ - Error *local_err = NULL; - AioContext *ctx; - - if (!s->use_linux_io_uring) { - return false; - } - - ctx = qemu_get_current_aio_context(); - if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) { - error_reportf_err(local_err, "Unable to use linux io_uring, " - "falling back to thread pool: "); - s->use_linux_io_uring = false; - return false; - } - return true; -} -#endif - #ifdef CONFIG_LINUX_AIO static inline bool raw_check_linux_aio(BDRVRawState *s) { @@ -2595,7 +2583,7 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr, if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; #ifdef CONFIG_LINUX_IO_URING - } else if (raw_check_linux_io_uring(s)) { + } else if (s->use_linux_io_uring) { assert(qiov->size == bytes); ret = luring_co_submit(bs, s->fd, offset, qiov, type, flags); goto out; @@ -2692,7 +2680,7 @@ static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs) }; #ifdef CONFIG_LINUX_IO_URING - if (raw_check_linux_io_uring(s)) { + if (s->use_linux_io_uring) { return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH, 0); } #endif diff --git a/block/io_uring.c b/block/io_uring.c index dd4f304910..dd930ee57e 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -11,28 +11,20 @@ #include "qemu/osdep.h" #include <liburing.h> #include "block/aio.h" -#include "qemu/queue.h" #include "block/block.h" #include "block/raw-aio.h" #include "qemu/coroutine.h" -#include "qemu/defer-call.h" -#include "qapi/error.h" #include "system/block-backend.h" #include "trace.h" -/* Only used for assertions. 
*/ -#include "qemu/coroutine_int.h" - -/* io_uring ring size */ -#define MAX_ENTRIES 128 - -typedef struct LuringAIOCB { +typedef struct { Coroutine *co; - struct io_uring_sqe sqeq; - ssize_t ret; QEMUIOVector *qiov; - bool is_read; - QSIMPLEQ_ENTRY(LuringAIOCB) next; + uint64_t offset; + ssize_t ret; + int type; + int fd; + BdrvRequestFlags flags; /* * Buffered reads may require resubmission, see @@ -40,36 +32,51 @@ typedef struct LuringAIOCB { */ int total_read; QEMUIOVector resubmit_qiov; -} LuringAIOCB; -typedef struct LuringQueue { - unsigned int in_queue; - unsigned int in_flight; - bool blocked; - QSIMPLEQ_HEAD(, LuringAIOCB) submit_queue; -} LuringQueue; + CqeHandler cqe_handler; +} LuringRequest; -struct LuringState { - AioContext *aio_context; - - struct io_uring ring; - - /* No locking required, only accessed from AioContext home thread */ - LuringQueue io_q; - - QEMUBH *completion_bh; -}; - -/** - * luring_resubmit: - * - * Resubmit a request by appending it to submit_queue. The caller must ensure - * that ioq_submit() is called later so that submit_queue requests are started. - */ -static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) +static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque) { - QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); - s->io_q.in_queue++; + LuringRequest *req = opaque; + QEMUIOVector *qiov = req->qiov; + uint64_t offset = req->offset; + int fd = req->fd; + BdrvRequestFlags flags = req->flags; + + switch (req->type) { + case QEMU_AIO_WRITE: +#ifdef HAVE_IO_URING_PREP_WRITEV2 + { + int luring_flags = (flags & BDRV_REQ_FUA) ? 
RWF_DSYNC : 0; + io_uring_prep_writev2(sqe, fd, qiov->iov, + qiov->niov, offset, luring_flags); + } +#else + assert(flags == 0); + io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset); +#endif + break; + case QEMU_AIO_ZONE_APPEND: + io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset); + break; + case QEMU_AIO_READ: + { + if (req->resubmit_qiov.iov != NULL) { + qiov = &req->resubmit_qiov; + } + io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov, + offset + req->total_read); + break; + } + case QEMU_AIO_FLUSH: + io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC); + break; + default: + fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n", + __func__, req->type); + abort(); + } } /** @@ -78,385 +85,115 @@ static void luring_resubmit(LuringState *s, LuringAIOCB *luringcb) * Short reads are rare but may occur. The remaining read request needs to be * resubmitted. */ -static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb, - int nread) +static void luring_resubmit_short_read(LuringRequest *req, int nread) { QEMUIOVector *resubmit_qiov; size_t remaining; - trace_luring_resubmit_short_read(s, luringcb, nread); + trace_luring_resubmit_short_read(req, nread); /* Update read position */ - luringcb->total_read += nread; - remaining = luringcb->qiov->size - luringcb->total_read; + req->total_read += nread; + remaining = req->qiov->size - req->total_read; /* Shorten qiov */ - resubmit_qiov = &luringcb->resubmit_qiov; + resubmit_qiov = &req->resubmit_qiov; if (resubmit_qiov->iov == NULL) { - qemu_iovec_init(resubmit_qiov, luringcb->qiov->niov); + qemu_iovec_init(resubmit_qiov, req->qiov->niov); } else { qemu_iovec_reset(resubmit_qiov); } - qemu_iovec_concat(resubmit_qiov, luringcb->qiov, luringcb->total_read, - remaining); + qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining); - /* Update sqe */ - luringcb->sqeq.off += nread; - luringcb->sqeq.addr = (uintptr_t)luringcb->resubmit_qiov.iov; - luringcb->sqeq.len 
= luringcb->resubmit_qiov.niov; - - luring_resubmit(s, luringcb); + aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler); } -/** - * luring_process_completions: - * @s: AIO state - * - * Fetches completed I/O requests, consumes cqes and invokes their callbacks - * The function is somewhat tricky because it supports nested event loops, for - * example when a request callback invokes aio_poll(). - * - * Function schedules BH completion so it can be called again in a nested - * event loop. When there are no events left to complete the BH is being - * canceled. - * - */ -static void luring_process_completions(LuringState *s) +static void luring_cqe_handler(CqeHandler *cqe_handler) { - struct io_uring_cqe *cqes; - int total_bytes; + LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler); + int ret = cqe_handler->cqe.res; - defer_call_begin(); + trace_luring_cqe_handler(req, ret); - /* - * Request completion callbacks can run the nested event loop. - * Schedule ourselves so the nested event loop will "see" remaining - * completed requests and process them. Without this, completion - * callbacks that wait for other requests using a nested event loop - * would hang forever. - * - * This workaround is needed because io_uring uses poll_wait, which - * is woken up when new events are added to the uring, thus polling on - * the same uring fd will block unless more events are received. - * - * Other leaf block drivers (drivers that access the data themselves) - * are networking based, so they poll sockets for data and run the - * correct coroutine. - */ - qemu_bh_schedule(s->completion_bh); - - while (io_uring_peek_cqe(&s->ring, &cqes) == 0) { - LuringAIOCB *luringcb; - int ret; - - if (!cqes) { - break; + if (ret < 0) { + /* + * Only writev/readv/fsync requests on regular files or host block + * devices are submitted. Therefore -EAGAIN is not expected but it's + * known to happen sometimes with Linux SCSI. 
Submit again and hope + * the request completes successfully. + * + * For more information, see: + * https://lore.kernel.org/io-uring/20210727165811.284510-3-ax...@kernel.dk/T/#u + * + * If the code is changed to submit other types of requests in the + * future, then this workaround may need to be extended to deal with + * genuine -EAGAIN results that should not be resubmitted + * immediately. + */ + if (ret == -EINTR || ret == -EAGAIN) { + aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler); + return; } - - luringcb = io_uring_cqe_get_data(cqes); - ret = cqes->res; - io_uring_cqe_seen(&s->ring, cqes); - cqes = NULL; - - /* Change counters one-by-one because we can be nested. */ - s->io_q.in_flight--; - trace_luring_process_completion(s, luringcb, ret); - + } else if (req->qiov) { /* total_read is non-zero only for resubmitted read requests */ - total_bytes = ret + luringcb->total_read; + int total_bytes = ret + req->total_read; - if (ret < 0) { - /* - * Only writev/readv/fsync requests on regular files or host block - * devices are submitted. Therefore -EAGAIN is not expected but it's - * known to happen sometimes with Linux SCSI. Submit again and hope - * the request completes successfully. - * - * For more information, see: - * https://lore.kernel.org/io-uring/20210727165811.284510-3-ax...@kernel.dk/T/#u - * - * If the code is changed to submit other types of requests in the - * future, then this workaround may need to be extended to deal with - * genuine -EAGAIN results that should not be resubmitted - * immediately. 
- */ - if (ret == -EINTR || ret == -EAGAIN) { - luring_resubmit(s, luringcb); - continue; - } - } else if (!luringcb->qiov) { - goto end; - } else if (total_bytes == luringcb->qiov->size) { + if (total_bytes == req->qiov->size) { ret = 0; - /* Only read/write */ } else { /* Short Read/Write */ - if (luringcb->is_read) { + if (req->type == QEMU_AIO_READ) { if (ret > 0) { - luring_resubmit_short_read(s, luringcb, ret); - continue; - } else { - /* Pad with zeroes */ - qemu_iovec_memset(luringcb->qiov, total_bytes, 0, - luringcb->qiov->size - total_bytes); - ret = 0; + luring_resubmit_short_read(req, ret); + return; } + + /* Pad with zeroes */ + qemu_iovec_memset(req->qiov, total_bytes, 0, + req->qiov->size - total_bytes); + ret = 0; } else { ret = -ENOSPC; } } -end: - luringcb->ret = ret; - qemu_iovec_destroy(&luringcb->resubmit_qiov); - - /* - * If the coroutine is already entered it must be in ioq_submit() - * and will notice luringcb->ret has been filled in when it - * eventually runs later. Coroutines cannot be entered recursively - * so avoid doing that! 
- */ - assert(luringcb->co->ctx == s->aio_context); - if (!qemu_coroutine_entered(luringcb->co)) { - aio_co_wake(luringcb->co); - } } - qemu_bh_cancel(s->completion_bh); + req->ret = ret; + qemu_iovec_destroy(&req->resubmit_qiov); - defer_call_end(); -} - -static int ioq_submit(LuringState *s) -{ - int ret = 0; - LuringAIOCB *luringcb, *luringcb_next; - - while (s->io_q.in_queue > 0) { - /* - * Try to fetch sqes from the ring for requests waiting in - * the overflow queue - */ - QSIMPLEQ_FOREACH_SAFE(luringcb, &s->io_q.submit_queue, next, - luringcb_next) { - struct io_uring_sqe *sqes = io_uring_get_sqe(&s->ring); - if (!sqes) { - break; - } - /* Prep sqe for submission */ - *sqes = luringcb->sqeq; - QSIMPLEQ_REMOVE_HEAD(&s->io_q.submit_queue, next); - } - ret = io_uring_submit(&s->ring); - trace_luring_io_uring_submit(s, ret); - /* Prevent infinite loop if submission is refused */ - if (ret <= 0) { - if (ret == -EAGAIN || ret == -EINTR) { - continue; - } - break; - } - s->io_q.in_flight += ret; - s->io_q.in_queue -= ret; - } - s->io_q.blocked = (s->io_q.in_queue > 0); - - if (s->io_q.in_flight) { - /* - * We can try to complete something just right away if there are - * still requests in-flight. - */ - luring_process_completions(s); - } - return ret; -} - -static void luring_process_completions_and_submit(LuringState *s) -{ - luring_process_completions(s); - - if (s->io_q.in_queue > 0) { - ioq_submit(s); + /* + * If the coroutine is already entered it must be in luring_co_submit() and + * will notice req->ret has been filled in when it eventually runs later. + * Coroutines cannot be entered recursively so avoid doing that! 
+ */ + if (!qemu_coroutine_entered(req->co)) { + aio_co_wake(req->co); } } -static void qemu_luring_completion_bh(void *opaque) +int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, + uint64_t offset, QEMUIOVector *qiov, + int type, BdrvRequestFlags flags) { - LuringState *s = opaque; - luring_process_completions_and_submit(s); -} - -static void qemu_luring_completion_cb(void *opaque) -{ - LuringState *s = opaque; - luring_process_completions_and_submit(s); -} - -static bool qemu_luring_poll_cb(void *opaque) -{ - LuringState *s = opaque; - - return io_uring_cq_ready(&s->ring); -} - -static void qemu_luring_poll_ready(void *opaque) -{ - LuringState *s = opaque; - - luring_process_completions_and_submit(s); -} - -static void ioq_init(LuringQueue *io_q) -{ - QSIMPLEQ_INIT(&io_q->submit_queue); - io_q->in_queue = 0; - io_q->in_flight = 0; - io_q->blocked = false; -} - -static void luring_deferred_fn(void *opaque) -{ - LuringState *s = opaque; - trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue, - s->io_q.in_flight); - if (!s->io_q.blocked && s->io_q.in_queue > 0) { - ioq_submit(s); - } -} - -/** - * luring_do_submit: - * @fd: file descriptor for I/O - * @luringcb: AIO control block - * @s: AIO state - * @offset: offset for request - * @type: type of request - * - * Fetches sqes from ring, adds to pending queue and preps them - * - */ -static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s, - uint64_t offset, int type, BdrvRequestFlags flags) -{ - int ret; - struct io_uring_sqe *sqes = &luringcb->sqeq; - - switch (type) { - case QEMU_AIO_WRITE: -#ifdef HAVE_IO_URING_PREP_WRITEV2 - { - int luring_flags = (flags & BDRV_REQ_FUA) ? 
RWF_DSYNC : 0; - io_uring_prep_writev2(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset, luring_flags); - } -#else - assert(flags == 0); - io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); -#endif - break; - case QEMU_AIO_ZONE_APPEND: - io_uring_prep_writev(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); - break; - case QEMU_AIO_READ: - io_uring_prep_readv(sqes, fd, luringcb->qiov->iov, - luringcb->qiov->niov, offset); - break; - case QEMU_AIO_FLUSH: - io_uring_prep_fsync(sqes, fd, IORING_FSYNC_DATASYNC); - break; - default: - fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n", - __func__, type); - abort(); - } - io_uring_sqe_set_data(sqes, luringcb); - - QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next); - s->io_q.in_queue++; - trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue, - s->io_q.in_flight); - if (!s->io_q.blocked) { - if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) { - ret = ioq_submit(s); - trace_luring_do_submit_done(s, ret); - return ret; - } - - defer_call(luring_deferred_fn, s); - } - return 0; -} - -int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd, uint64_t offset, - QEMUIOVector *qiov, int type, - BdrvRequestFlags flags) -{ - int ret; - AioContext *ctx = qemu_get_current_aio_context(); - LuringState *s = aio_get_linux_io_uring(ctx); - LuringAIOCB luringcb = { + LuringRequest req = { .co = qemu_coroutine_self(), - .ret = -EINPROGRESS, .qiov = qiov, - .is_read = (type == QEMU_AIO_READ), + .ret = -EINPROGRESS, + .type = type, + .fd = fd, + .offset = offset, + .flags = flags, }; - trace_luring_co_submit(bs, s, &luringcb, fd, offset, qiov ? qiov->size : 0, - type); - ret = luring_do_submit(fd, &luringcb, s, offset, type, flags); - if (ret < 0) { - return ret; - } + req.cqe_handler.cb = luring_cqe_handler; - if (luringcb.ret == -EINPROGRESS) { + trace_luring_co_submit(bs, &req, fd, offset, qiov ? 
qiov->size : 0, type); + aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler); + + if (req.ret == -EINPROGRESS) { qemu_coroutine_yield(); } - return luringcb.ret; -} - -void luring_detach_aio_context(LuringState *s, AioContext *old_context) -{ - aio_set_fd_handler(old_context, s->ring.ring_fd, - NULL, NULL, NULL, NULL, s); - qemu_bh_delete(s->completion_bh); - s->aio_context = NULL; -} - -void luring_attach_aio_context(LuringState *s, AioContext *new_context) -{ - s->aio_context = new_context; - s->completion_bh = aio_bh_new(new_context, qemu_luring_completion_bh, s); - aio_set_fd_handler(s->aio_context, s->ring.ring_fd, - qemu_luring_completion_cb, NULL, - qemu_luring_poll_cb, qemu_luring_poll_ready, s); -} - -LuringState *luring_init(Error **errp) -{ - int rc; - LuringState *s = g_new0(LuringState, 1); - struct io_uring *ring = &s->ring; - - trace_luring_init_state(s, sizeof(*s)); - - rc = io_uring_queue_init(MAX_ENTRIES, ring, 0); - if (rc < 0) { - error_setg_errno(errp, -rc, "failed to init linux io_uring ring"); - g_free(s); - return NULL; - } - - ioq_init(&s->io_q); - return s; - -} - -void luring_cleanup(LuringState *s) -{ - io_uring_queue_exit(&s->ring); - trace_luring_cleanup_state(s); - g_free(s); + return req.ret; } bool luring_has_fua(void) diff --git a/stubs/io_uring.c b/stubs/io_uring.c deleted file mode 100644 index 622d1e4648..0000000000 --- a/stubs/io_uring.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Linux io_uring support. - * - * Copyright (C) 2009 IBM, Corp. - * Copyright (C) 2009 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. 
- */ -#include "qemu/osdep.h" -#include "block/aio.h" -#include "block/raw-aio.h" - -void luring_detach_aio_context(LuringState *s, AioContext *old_context) -{ - abort(); -} - -void luring_attach_aio_context(LuringState *s, AioContext *new_context) -{ - abort(); -} - -LuringState *luring_init(Error **errp) -{ - abort(); -} - -void luring_cleanup(LuringState *s) -{ - abort(); -} diff --git a/util/async.c b/util/async.c index bba9622e97..d66575acd2 100644 --- a/util/async.c +++ b/util/async.c @@ -383,14 +383,6 @@ aio_ctx_finalize(GSource *source) } #endif -#ifdef CONFIG_LINUX_IO_URING - if (ctx->linux_io_uring) { - luring_detach_aio_context(ctx->linux_io_uring, ctx); - luring_cleanup(ctx->linux_io_uring); - ctx->linux_io_uring = NULL; - } -#endif - assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); qemu_bh_delete(ctx->co_schedule_bh); @@ -465,29 +457,6 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) } #endif -#ifdef CONFIG_LINUX_IO_URING -LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) -{ - if (ctx->linux_io_uring) { - return ctx->linux_io_uring; - } - - ctx->linux_io_uring = luring_init(errp); - if (!ctx->linux_io_uring) { - return NULL; - } - - luring_attach_aio_context(ctx->linux_io_uring, ctx); - return ctx->linux_io_uring; -} - -LuringState *aio_get_linux_io_uring(AioContext *ctx) -{ - assert(ctx->linux_io_uring); - return ctx->linux_io_uring; -} -#endif - void aio_notify(AioContext *ctx) { /* @@ -611,10 +580,6 @@ AioContext *aio_context_new(Error **errp) ctx->linux_aio = NULL; #endif -#ifdef CONFIG_LINUX_IO_URING - ctx->linux_io_uring = NULL; -#endif - ctx->thread_pool = NULL; qemu_rec_mutex_init(&ctx->lock); timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c index 03a07a4caf..2c64f80e5f 100644 --- a/util/fdmon-io_uring.c +++ b/util/fdmon-io_uring.c @@ -48,6 +48,7 @@ #include "qapi/error.h" #include "qemu/rcu_queue.h" #include "aio-posix.h" +#include "trace.h" enum { 
FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ @@ -174,6 +175,9 @@ static void fdmon_io_uring_add_sqe(AioContext *ctx, prep_sqe(sqe, opaque); io_uring_sqe_set_data(sqe, cqe_handler); + + trace_fdmon_io_uring_add_sqe(ctx, opaque, sqe->opcode, sqe->fd, sqe->off, + cqe_handler); } static void fdmon_special_cqe_handler(CqeHandler *cqe_handler) @@ -290,6 +294,8 @@ static void cqe_handler_bh(void *opaque) QSIMPLEQ_REMOVE_HEAD(ready_list, next); + trace_fdmon_io_uring_cqe_handler(ctx, cqe_handler, + cqe_handler->cqe.res); cqe_handler->cb(cqe_handler); } diff --git a/block/trace-events b/block/trace-events index 8e789e1f12..c9b4736ff8 100644 --- a/block/trace-events +++ b/block/trace-events @@ -62,15 +62,9 @@ qmp_block_stream(void *bs) "bs %p" file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" # io_uring.c -luring_init_state(void *s, size_t size) "s %p size %zu" -luring_cleanup_state(void *s) "%p freed" -luring_unplug_fn(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" -luring_do_submit(void *s, int blocked, int queued, int inflight) "LuringState %p blocked %d queued %d inflight %d" -luring_do_submit_done(void *s, int ret) "LuringState %p submitted to kernel %d" -luring_co_submit(void *bs, void *s, void *luringcb, int fd, uint64_t offset, size_t nbytes, int type) "bs %p s %p luringcb %p fd %d offset %" PRId64 " nbytes %zd type %d" -luring_process_completion(void *s, void *aiocb, int ret) "LuringState %p luringcb %p ret %d" -luring_io_uring_submit(void *s, int ret) "LuringState %p ret %d" -luring_resubmit_short_read(void *s, void *luringcb, int nread) "LuringState %p luringcb %p nread %d" +luring_cqe_handler(void *req, int ret) "req %p ret %d" +luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d" +luring_resubmit_short_read(void *req, int nread) 
"req %p nread %d" # qcow2.c qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu" diff --git a/stubs/meson.build b/stubs/meson.build index 63392f5e78..d157b06273 100644 --- a/stubs/meson.build +++ b/stubs/meson.build @@ -32,9 +32,6 @@ if have_block or have_ga stub_ss.add(files('cpus-virtual-clock.c')) stub_ss.add(files('icount.c')) stub_ss.add(files('graph-lock.c')) - if linux_io_uring.found() - stub_ss.add(files('io_uring.c')) - endif if libaio.found() stub_ss.add(files('linux-aio.c')) endif diff --git a/util/trace-events b/util/trace-events index bd8f25fb59..540d662507 100644 --- a/util/trace-events +++ b/util/trace-events @@ -24,6 +24,10 @@ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s" buffer_free(const char *buf, size_t len) "%s: capacity %zd" +# fdmon-io_uring.c +fdmon_io_uring_add_sqe(void *ctx, void *opaque, int opcode, int fd, uint64_t off, void *cqe_handler) "ctx %p opaque %p opcode %d fd %d off %"PRId64" cqe_handler %p" +fdmon_io_uring_cqe_handler(void *ctx, void *cqe_handler, int cqe_res) "ctx %p cqe_handler %p cqe_res %d" + # filemonitor-inotify.c qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64 qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64 -- 2.49.0