On Fri, 28 Nov 2025 at 15:47, Paolo Bonzini <[email protected]> wrote: > Rust bindings are roughly broken up according to subdirectories of > include/ (that's not exact, but it's roughly an aim). However, > block/aio.h contains both block layer-specific concepts (BlockAIOCB, > BlockCompletionFunc) and AioContext-related declarations that are > used be qemu/main-loop.h. > > Break out the latter into their own header file, and use that to > break the inclusion of block/ from qemu/main-loop.h. > > Signed-off-by: Paolo Bonzini <[email protected]> > --- > Based on top of > > https://lore.kernel.org/qemu-devel/[email protected]/ > > include/block/aio.h | 838 +------------------------------------- > include/qemu/aio.h | 852 +++++++++++++++++++++++++++++++++++++++ > include/qemu/main-loop.h | 4 +- > 3 files changed, 857 insertions(+), 837 deletions(-) > create mode 100644 include/qemu/aio.h > > diff --git a/include/block/aio.h b/include/block/aio.h > index cc3d5f25a24..dba423f896e 100644 > --- a/include/block/aio.h > +++ b/include/block/aio.h > @@ -11,22 +11,13 @@ > * > */ > > -#ifndef QEMU_AIO_H > -#define QEMU_AIO_H > +#ifndef QEMU_BLOCK_AIO_H > +#define QEMU_BLOCK_AIO_H > > -#ifdef CONFIG_LINUX_IO_URING > -#include <liburing.h> > -#endif > -#include "qemu/coroutine-core.h" > -#include "qemu/queue.h" > -#include "qemu/event_notifier.h" > -#include "qemu/lockcnt.h" > -#include "qemu/thread.h" > -#include "qemu/timer.h" > +#include "qemu/aio.h" > #include "block/graph-lock.h" > #include "hw/core/qdev.h" > > - > typedef struct BlockAIOCB BlockAIOCB; > typedef void BlockCompletionFunc(void *opaque, int ret); > > @@ -48,827 +39,4 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, > BlockDriverState *bs, > void qemu_aio_unref(void *p); > void qemu_aio_ref(void *p); > > -typedef struct AioHandler AioHandler; > -typedef QLIST_HEAD(, AioHandler) AioHandlerList; > -typedef void QEMUBHFunc(void *opaque); > -typedef bool AioPollFn(void *opaque); > -typedef void IOHandler(void *opaque); > - > -struct ThreadPoolAio; > -struct LinuxAioState; > -typedef struct LuringState LuringState; > - > -/* Is polling disabled? */ > -bool aio_poll_disabled(AioContext *ctx); > - > -#ifdef CONFIG_LINUX_IO_URING > -/* > - * Each io_uring request must have a unique CqeHandler that processes the > cqe. > - * The lifetime of a CqeHandler must be at least from aio_add_sqe() until > - * ->cb() invocation. > - */ > -typedef struct CqeHandler CqeHandler; > -struct CqeHandler { > - /* Called by the AioContext when the request has completed */ > - void (*cb)(CqeHandler *handler); > - > - /* Used internally, do not access this */ > - QSIMPLEQ_ENTRY(CqeHandler) next; > - > - /* This field is filled in before ->cb() is called */ > - struct io_uring_cqe cqe; > -}; > - > -typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ; > -#endif /* CONFIG_LINUX_IO_URING */ > - > -/* Callbacks for file descriptor monitoring implementations */ > -typedef struct { > - /* > - * update: > - * @ctx: the AioContext > - * @old_node: the existing handler or NULL if this file descriptor is > being > - * monitored for the first time > - * @new_node: the new handler or NULL if this file descriptor is being > - * removed > - * > - * Add/remove/modify a monitored file descriptor. > - * > - * Called with ctx->list_lock acquired. 
> - */ > - void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler > *new_node); > - > - /* > - * wait: > - * @ctx: the AioContext > - * @ready_list: list for handlers that become ready > - * @timeout: maximum duration to wait, in nanoseconds > - * > - * Wait for file descriptors to become ready and place them on > ready_list. > - * > - * Called with ctx->list_lock incremented but not locked. > - * > - * Returns: number of ready file descriptors. > - */ > - int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t > timeout); > - > - /* > - * need_wait: > - * @ctx: the AioContext > - * > - * Tell aio_poll() when to stop userspace polling early because ->wait() > - * has fds ready. > - * > - * File descriptor monitoring implementations that cannot poll fd > readiness > - * from userspace should use aio_poll_disabled() here. This ensures that > - * file descriptors are not starved by handlers that frequently make > - * progress via userspace polling. > - * > - * Returns: true if ->wait() should be called, false otherwise. > - */ > - bool (*need_wait)(AioContext *ctx); > - > - /* > - * dispatch: > - * @ctx: the AioContext > - * > - * Dispatch any work that is specific to this file descriptor monitoring > - * implementation. Usually the event loop's generic file descriptor > - * monitoring, BH, and timer dispatching code is sufficient, but file > - * descriptor monitoring implementations offering additional > functionality > - * may need to implement this function for custom behavior. Called at a > - * point in the event loop when it is safe to invoke user-defined > - * callbacks. > - * > - * This function is optional and may be NULL. > - * > - * Returns: true if progress was made (see aio_poll()'s return value), > - * false otherwise. > - */ > - bool (*dispatch)(AioContext *ctx); > - > - /* > - * gsource_prepare: > - * @ctx: the AioContext > - * > - * Prepare for the glib event loop to wait for events instead of the > usual > - * ->wait() call. See glib's GSourceFuncs->prepare(). > - */ > - void (*gsource_prepare)(AioContext *ctx); > - > - /* > - * gsource_check: > - * @ctx: the AioContext > - * > - * Called by the glib event loop from glib's GSourceFuncs->check() after > - * waiting for events. > - * > - * Returns: true when ready to be dispatched. > - */ > - bool (*gsource_check)(AioContext *ctx); > - > - /* > - * gsource_dispatch: > - * @ctx: the AioContext > - * @ready_list: list for handlers that become ready > - * > - * Place ready AioHandlers on ready_list. Called as part of the glib > event > - * loop from glib's GSourceFuncs->dispatch(). > - * > - * Called with list_lock incremented. > - */ > - void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list); > - > -#ifdef CONFIG_LINUX_IO_URING > - /** > - * add_sqe: Add an io_uring sqe for submission. > - * @prep_sqe: invoked with an sqe that should be prepared for submission > - * @opaque: user-defined argument to @prep_sqe() > - * @cqe_handler: the unique cqe handler associated with this request > - * > - * The caller's @prep_sqe() function is invoked to fill in the details of > - * the sqe. Do not call io_uring_sqe_set_data() on this sqe. > - * > - * The kernel may see the sqe as soon as @prep_sqe() returns or it may > take > - * until the next event loop iteration. > - * > - * This function is called from the current AioContext and is not > - * thread-safe. 
> - */ > - void (*add_sqe)(AioContext *ctx, > - void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), > - void *opaque, CqeHandler *cqe_handler); > -#endif /* CONFIG_LINUX_IO_URING */ > -} FDMonOps; > - > -/* > - * Each aio_bh_poll() call carves off a slice of the BH list, so that newly > - * scheduled BHs are not processed until the next aio_bh_poll() call. All > - * active aio_bh_poll() calls chain their slices together in a list, so that > - * nested aio_bh_poll() calls process all scheduled bottom halves. > - */ > -typedef QSLIST_HEAD(, QEMUBH) BHList; > -typedef struct BHListSlice BHListSlice; > -struct BHListSlice { > - BHList bh_list; > - QSIMPLEQ_ENTRY(BHListSlice) next; > -}; > - > -typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; > - > -typedef struct AioPolledEvent { > - int64_t ns; /* current polling time in nanoseconds */ > -} AioPolledEvent; > - > -struct AioContext { > - GSource source; > - > - /* Used by AioContext users to protect from multi-threaded access. */ > - QemuRecMutex lock; > - > - /* > - * Keep track of readers and writers of the block layer graph. > - * This is essential to avoid performing additions and removal > - * of nodes and edges from block graph while some > - * other thread is traversing it. > - */ > - BdrvGraphRWlock *bdrv_graph; > - > - /* The list of registered AIO handlers. Protected by ctx->list_lock. */ > - AioHandlerList aio_handlers; > - > - /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. > */ > - AioHandlerList deleted_aio_handlers; > - > - /* Used to avoid unnecessary event_notifier_set calls in aio_notify; > - * only written from the AioContext home thread, or under the BQL in > - * the case of the main AioContext. However, it is read from any > - * thread so it is still accessed with atomic primitives. > - * > - * If this field is 0, everything (file descriptors, bottom halves, > - * timers) will be re-evaluated before the next blocking poll() or > - * io_uring wait; therefore, the event_notifier_set call can be > - * skipped. If it is non-zero, you may need to wake up a concurrent > - * aio_poll or the glib main event loop, making event_notifier_set > - * necessary. > - * > - * Bit 0 is reserved for GSource usage of the AioContext, and is 1 > - * between a call to aio_ctx_prepare and the next call to aio_ctx_check. > - * Bits 1-31 simply count the number of active calls to aio_poll > - * that are in the prepare or poll phase. > - * > - * The GSource and aio_poll must use a different mechanism because > - * there is no certainty that a call to GSource's prepare callback > - * (via g_main_context_prepare) is indeed followed by check and > - * dispatch. It's not clear whether this would be a bug, but let's > - * play safe and allow it---it will just cause extra calls to > - * event_notifier_set until the next call to dispatch. > - * > - * Instead, the aio_poll calls include both the prepare and the > - * dispatch phase, hence a simple counter is enough for them. > - */ > - uint32_t notify_me; > - > - /* A lock to protect between QEMUBH and AioHandler adders and deleter, > - * and to ensure that no callbacks are removed while we're walking and > - * dispatching them. > - */ > - QemuLockCnt list_lock; > - > - /* Bottom Halves pending aio_bh_poll() processing */ > - BHList bh_list; > - > - /* Chained BH list slices for each nested aio_bh_poll() call */ > - QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list; > - > - /* Used by aio_notify. 
> - * > - * "notified" is used to avoid expensive event_notifier_test_and_clear > - * calls. When it is clear, the EventNotifier is clear, or one thread > - * is going to clear "notified" before processing more events. False > - * positives are possible, i.e. "notified" could be set even though the > - * EventNotifier is clear. > - * > - * Note that event_notifier_set *cannot* be optimized the same way. For > - * more information on the problem that would result, see "#ifdef BUG2" > - * in the docs/aio_notify_accept.promela formal model. > - */ > - bool notified; > - EventNotifier notifier; > - > - QSLIST_HEAD(, Coroutine) scheduled_coroutines; > - QEMUBH *co_schedule_bh; > - > - int thread_pool_min; > - int thread_pool_max; > - /* Thread pool for performing work and receiving completion callbacks. > - * Has its own locking. > - */ > - struct ThreadPoolAio *thread_pool; > - > -#ifdef CONFIG_LINUX_AIO > - struct LinuxAioState *linux_aio; > -#endif > -#ifdef CONFIG_LINUX_IO_URING > - /* State for file descriptor monitoring using Linux io_uring */ > - struct io_uring fdmon_io_uring; > - AioHandlerSList submit_list; > - void *io_uring_fd_tag; > - > - /* Pending callback state for cqe handlers */ > - CqeHandlerSimpleQ cqe_handler_ready_list; > -#endif /* CONFIG_LINUX_IO_URING */ > - > - /* TimerLists for calling timers - one per clock type. Has its own > - * locking. > - */ > - QEMUTimerListGroup tlg; > - > - /* Number of AioHandlers without .io_poll() */ > - int poll_disable_cnt; > - > - /* Polling mode parameters */ > - int64_t poll_max_ns; /* maximum polling time in nanoseconds */ > - int64_t poll_grow; /* polling time growth factor */ > - int64_t poll_shrink; /* polling time shrink factor */ > - > - /* AIO engine parameters */ > - int64_t aio_max_batch; /* maximum number of requests in a batch */ > - > - /* > - * List of handlers participating in userspace polling. Protected by > - * ctx->list_lock. Iterated and modified mostly by the event loop thread > - * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() > - * only touches the list to delete nodes if ctx->list_lock's count is > zero. > - */ > - AioHandlerList poll_aio_handlers; > - > - /* Are we in polling mode or monitoring file descriptors? */ > - bool poll_started; > - > - /* epoll(7) state used when built with CONFIG_EPOLL */ > - int epollfd; > - > - /* The GSource unix fd tag for epollfd */ > - void *epollfd_tag; > - > - const FDMonOps *fdmon_ops; > - > - /* Was aio_context_new() successful? */ > - bool initialized; > -}; > - > -/** > - * aio_context_new: Allocate a new AioContext. > - * > - * AioContext provide a mini event-loop that can be waited on synchronously. > - * They also provide bottom halves, a service to execute a piece of code > - * as soon as possible. > - */ > -AioContext *aio_context_new(Error **errp); > - > -/** > - * aio_context_ref: > - * @ctx: The AioContext to operate on. > - * > - * Add a reference to an AioContext. > - */ > -void aio_context_ref(AioContext *ctx); > - > -/** > - * aio_context_unref: > - * @ctx: The AioContext to operate on. > - * > - * Drop a reference to an AioContext. > - */ > -void aio_context_unref(AioContext *ctx); > - > -/** > - * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that > will > - * run only once and as soon as possible. > - * > - * @name: A human-readable identifier for debugging purposes. 
> - */ > -void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void > *opaque, > - const char *name); > - > -/** > - * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will > run > - * only once and as soon as possible. > - * > - * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as > the > - * name string. > - */ > -#define aio_bh_schedule_oneshot(ctx, cb, opaque) \ > - aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb))) > - > -/** > - * aio_bh_new_full: Allocate a new bottom half structure. > - * > - * Bottom halves are lightweight callbacks whose invocation is guaranteed > - * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure > - * is opaque and must be allocated prior to its use. > - * > - * @name: A human-readable identifier for debugging purposes. > - * @reentrancy_guard: A guard set when entering a cb to prevent > - * device-reentrancy issues > - */ > -QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, > - const char *name, MemReentrancyGuard > *reentrancy_guard); > - > -/** > - * aio_bh_new: Allocate a new bottom half structure > - * > - * A convenience wrapper for aio_bh_new_full() that uses the cb as the name > - * string. > - */ > -#define aio_bh_new(ctx, cb, opaque) \ > - aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL) > - > -/** > - * aio_bh_new_guarded: Allocate a new bottom half structure with a > - * reentrancy_guard > - * > - * A convenience wrapper for aio_bh_new_full() that uses the cb as the name > - * string. > - */ > -#define aio_bh_new_guarded(ctx, cb, opaque, guard) \ > - aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard) > - > -/** > - * aio_notify: Force processing of pending events. > - * > - * Similar to signaling a condition variable, aio_notify forces > - * aio_poll to exit, so that the next call will re-examine pending events. > - * The caller of aio_notify will usually call aio_poll again very soon, > - * or go through another iteration of the GLib main loop. Hence, aio_notify > - * also has the side effect of recalculating the sets of file descriptors > - * that the main loop waits for. > - * > - * Calling aio_notify is rarely necessary, because for example scheduling > - * a bottom half calls it already. > - */ > -void aio_notify(AioContext *ctx); > - > -/** > - * aio_notify_accept: Acknowledge receiving an aio_notify. > - * > - * aio_notify() uses an EventNotifier in order to wake up a sleeping > - * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are > - * usually rare, but the AioContext has to clear the EventNotifier on > - * every aio_poll() or g_main_context_iteration() in order to avoid > - * busy waiting. This event_notifier_test_and_clear() cannot be done > - * using the usual aio_context_set_event_notifier(), because it must > - * be done before processing all events (file descriptors, bottom halves, > - * timers). > - * > - * aio_notify_accept() is an optimized event_notifier_test_and_clear() > - * that is specific to an AioContext's notifier; it is used internally > - * to clear the EventNotifier only if aio_notify() had been called. > - */ > -void aio_notify_accept(AioContext *ctx); > - > -/** > - * aio_bh_call: Executes callback function of the specified BH. > - */ > -void aio_bh_call(QEMUBH *bh); > - > -/** > - * aio_bh_poll: Poll bottom halves for an AioContext. > - * > - * These are internal functions used by the QEMU main loop. 
> - * And notice that multiple occurrences of aio_bh_poll cannot > - * be called concurrently > - */ > -int aio_bh_poll(AioContext *ctx); > - > -/** > - * qemu_bh_schedule: Schedule a bottom half. > - * > - * Scheduling a bottom half interrupts the main loop and causes the > - * execution of the callback that was passed to qemu_bh_new. > - * > - * Bottom halves that are scheduled from a bottom half handler are instantly > - * invoked. This can create an infinite loop if a bottom half handler > - * schedules itself. > - * > - * @bh: The bottom half to be scheduled. > - */ > -void qemu_bh_schedule(QEMUBH *bh); > - > -/** > - * qemu_bh_cancel: Cancel execution of a bottom half. > - * > - * Canceling execution of a bottom half undoes the effect of calls to > - * qemu_bh_schedule without freeing its resources yet. While cancellation > - * itself is also wait-free and thread-safe, it can of course race with the > - * loop that executes bottom halves unless you are holding the iothread > - * mutex. This makes it mostly useless if you are not holding the mutex. > - * > - * @bh: The bottom half to be canceled. > - */ > -void qemu_bh_cancel(QEMUBH *bh); > - > -/** > - *qemu_bh_delete: Cancel execution of a bottom half and free its resources. > - * > - * Deleting a bottom half frees the memory that was allocated for it by > - * qemu_bh_new. It also implies canceling the bottom half if it was > - * scheduled. > - * This func is async. The bottom half will do the delete action at the > finial > - * end. > - * > - * @bh: The bottom half to be deleted. > - */ > -void qemu_bh_delete(QEMUBH *bh); > - > -/* Return whether there are any pending callbacks from the GSource > - * attached to the AioContext, before g_poll is invoked. > - * > - * This is used internally in the implementation of the GSource. > - */ > -bool aio_prepare(AioContext *ctx); > - > -/* Return whether there are any pending callbacks from the GSource > - * attached to the AioContext, after g_poll is invoked. > - * > - * This is used internally in the implementation of the GSource. > - */ > -bool aio_pending(AioContext *ctx); > - > -/* Dispatch any pending callbacks from the GSource attached to the > AioContext. > - * > - * This is used internally in the implementation of the GSource. > - */ > -void aio_dispatch(AioContext *ctx); > - > -/* Progress in completing AIO work to occur. This can issue new pending > - * aio as a result of executing I/O completion or bh callbacks. > - * > - * Return whether any progress was made by executing AIO or bottom half > - * handlers. If @blocking == true, this should always be true except > - * if someone called aio_notify. > - * > - * If there are no pending bottom halves, but there are pending AIO > - * operations, it may not be possible to make any progress without > - * blocking. If @blocking is true, this function will wait until one > - * or more AIO events have completed, to ensure something has moved > - * before returning. > - */ > -bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking); > - > -/* Register a file descriptor and associated callbacks. Behaves very > similarly > - * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will > - * be invoked when using aio_poll(). > - * > - * Code that invokes AIO completion functions should rely on this function > - * instead of qemu_set_fd_handler[2]. 
> - */ > -void aio_set_fd_handler(AioContext *ctx, > - int fd, > - IOHandler *io_read, > - IOHandler *io_write, > - AioPollFn *io_poll, > - IOHandler *io_poll_ready, > - void *opaque); > - > -/* Register an event notifier and associated callbacks. Behaves very > similarly > - * to event_notifier_set_handler. Unlike event_notifier_set_handler, these > callbacks > - * will be invoked when using aio_poll(). > - * > - * Code that invokes AIO completion functions should rely on this function > - * instead of event_notifier_set_handler. > - */ > -void aio_set_event_notifier(AioContext *ctx, > - EventNotifier *notifier, > - EventNotifierHandler *io_read, > - AioPollFn *io_poll, > - EventNotifierHandler *io_poll_ready); > - > -/* > - * Set polling begin/end callbacks for an event notifier that has already > been > - * registered with aio_set_event_notifier. Do nothing if the event notifier > is > - * not registered. > - * > - * Note that if the io_poll_end() callback (or the entire notifier) is > removed > - * during polling, it will not be called, so an io_poll_begin() is not > - * necessarily always followed by an io_poll_end(). > - */ > -void aio_set_event_notifier_poll(AioContext *ctx, > - EventNotifier *notifier, > - EventNotifierHandler *io_poll_begin, > - EventNotifierHandler *io_poll_end); > - > -/* Return a GSource that lets the main loop poll the file descriptors > attached > - * to this AioContext. > - */ > -GSource *aio_get_g_source(AioContext *ctx); > - > -/* Return the ThreadPoolAio bound to this AioContext */ > -struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx); > - > -/* Setup the LinuxAioState bound to this AioContext */ > -struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); > - > -/* Return the LinuxAioState bound to this AioContext */ > -struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); > - > -/** > - * aio_timer_new_with_attrs: > - * @ctx: the aio context > - * @type: the clock type > - * @scale: the scale > - * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values > - * to assign > - * @cb: the callback to call on timer expiry > - * @opaque: the opaque pointer to pass to the callback > - * > - * Allocate a new timer (with attributes) attached to the context @ctx. > - * The function is responsible for memory allocation. > - * > - * The preferred interface is aio_timer_init or aio_timer_init_with_attrs. > - * Use that unless you really need dynamic memory allocation. > - * > - * Returns: a pointer to the new timer > - */ > -static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx, > - QEMUClockType type, > - int scale, int attributes, > - QEMUTimerCB *cb, void > *opaque) > -{ > - return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque); > -} > - > -/** > - * aio_timer_new: > - * @ctx: the aio context > - * @type: the clock type > - * @scale: the scale > - * @cb: the callback to call on timer expiry > - * @opaque: the opaque pointer to pass to the callback > - * > - * Allocate a new timer attached to the context @ctx. > - * See aio_timer_new_with_attrs for details. 
> - * > - * Returns: a pointer to the new timer > - */ > -static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type, > - int scale, > - QEMUTimerCB *cb, void *opaque) > -{ > - return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque); > -} > - > -/** > - * aio_timer_init_with_attrs: > - * @ctx: the aio context > - * @ts: the timer > - * @type: the clock type > - * @scale: the scale > - * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values > - * to assign > - * @cb: the callback to call on timer expiry > - * @opaque: the opaque pointer to pass to the callback > - * > - * Initialise a new timer (with attributes) attached to the context @ctx. > - * The caller is responsible for memory allocation. > - */ > -static inline void aio_timer_init_with_attrs(AioContext *ctx, > - QEMUTimer *ts, QEMUClockType > type, > - int scale, int attributes, > - QEMUTimerCB *cb, void *opaque) > -{ > - timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque); > -} > - > -/** > - * aio_timer_init: > - * @ctx: the aio context > - * @ts: the timer > - * @type: the clock type > - * @scale: the scale > - * @cb: the callback to call on timer expiry > - * @opaque: the opaque pointer to pass to the callback > - * > - * Initialise a new timer attached to the context @ctx. > - * See aio_timer_init_with_attrs for details. > - */ > -static inline void aio_timer_init(AioContext *ctx, > - QEMUTimer *ts, QEMUClockType type, > - int scale, > - QEMUTimerCB *cb, void *opaque) > -{ > - timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque); > -} > - > -/** > - * aio_compute_timeout: > - * @ctx: the aio context > - * > - * Compute the timeout that a blocking aio_poll should use. > - */ > -int64_t aio_compute_timeout(AioContext *ctx); > - > -/** > - * aio_co_schedule: > - * @ctx: the aio context > - * @co: the coroutine > - * > - * Start a coroutine on a remote AioContext. > - * > - * The coroutine must not be entered by anyone else while aio_co_schedule() > - * is active. In addition the coroutine must have yielded unless ctx > - * is the context in which the coroutine is running (i.e. the value of > - * qemu_get_current_aio_context() from the coroutine itself). > - */ > -void aio_co_schedule(AioContext *ctx, Coroutine *co); > - > -/** > - * aio_co_reschedule_self: > - * @new_ctx: the new context > - * > - * Move the currently running coroutine to new_ctx. If the coroutine is > already > - * running in new_ctx, do nothing. > - * > - * Note that this function cannot reschedule from iohandler_ctx to > - * qemu_aio_context. > - */ > -void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); > - > -/** > - * aio_co_wake: > - * @co: the coroutine > - * > - * Restart a coroutine on the AioContext where it was running last, thus > - * preventing coroutines from jumping from one context to another when they > - * go to sleep. > - * > - * aio_co_wake may be executed either in coroutine or non-coroutine > - * context. The coroutine must not be entered by anyone else while > - * aio_co_wake() is active. > - * > - * If `co`'s AioContext differs from the current AioContext, this will call > - * aio_co_schedule(), which makes this safe to use even when `co` has not > - * yielded yet. In such a case, it will be entered once it yields. > - * > - * In contrast, if `co`'s AioContext is equal to the current one, it is > - * required for `co` to currently be yielding. This is generally the case > - * if the caller is not in `co` (i.e. 
invoked by `co`), because the only > - * other way for the caller to be running then is for `co` to currently be > - * yielding. > - * > - * Therefore, if there is no way for the caller to be invoked/entered by > - * `co`, it is generally safe to call this regardless of whether `co` is > - * known to already be yielding or not -- it only has to yield at some > - * point. > - */ > -void aio_co_wake(Coroutine *co); > - > -/** > - * aio_co_enter: > - * @ctx: the context to run the coroutine > - * @co: the coroutine to run > - * > - * Enter a coroutine in the specified AioContext. > - */ > -void aio_co_enter(AioContext *ctx, Coroutine *co); > - > -/** > - * Return the AioContext whose event loop runs in the current thread. > - * > - * If called from an IOThread this will be the IOThread's AioContext. If > - * called from the main thread or with the "big QEMU lock" taken it > - * will be the main loop AioContext. > - * > - * Note that the return value is never the main loop's iohandler_ctx and the > - * return value is the main loop AioContext instead. > - */ > -AioContext *qemu_get_current_aio_context(void); > - > -void qemu_set_current_aio_context(AioContext *ctx); > - > -/** > - * aio_context_setup: > - * @ctx: the aio context > - * @errp: error pointer > - * > - * Initialize the aio context. > - * > - * Returns: true on success, false otherwise > - */ > -bool aio_context_setup(AioContext *ctx, Error **errp); > - > -/** > - * aio_context_destroy: > - * @ctx: the aio context > - * > - * Destroy the aio context. > - */ > -void aio_context_destroy(AioContext *ctx); > - > -/** > - * aio_context_set_poll_params: > - * @ctx: the aio context > - * @max_ns: how long to busy poll for, in nanoseconds > - * @grow: polling time growth factor > - * @shrink: polling time shrink factor > - * > - * Poll mode can be disabled by setting poll_max_ns to 0. > - */ > -void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, > - int64_t grow, int64_t shrink, > - Error **errp); > - > -/** > - * aio_context_set_aio_params: > - * @ctx: the aio context > - * @max_batch: maximum number of requests in a batch, 0 means that the > - * engine will use its default > - */ > -void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch); > - > -/** > - * aio_context_set_thread_pool_params: > - * @ctx: the aio context > - * @min: min number of threads to have readily available in the thread pool > - * @min: max number of threads the thread pool can contain > - */ > -void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, > - int64_t max, Error **errp); > - > -#ifdef CONFIG_LINUX_IO_URING > -/** > - * aio_has_io_uring: Return whether io_uring is available. > - * > - * io_uring is either available in all AioContexts or in none, so this only > - * needs to be called once from within any thread's AioContext. > - */ > -static inline bool aio_has_io_uring(void) > -{ > - AioContext *ctx = qemu_get_current_aio_context(); > - return ctx->fdmon_ops->add_sqe; > -} > - > -/** > - * aio_add_sqe: Add an io_uring sqe for submission. > - * @prep_sqe: invoked with an sqe that should be prepared for submission > - * @opaque: user-defined argument to @prep_sqe() > - * @cqe_handler: the unique cqe handler associated with this request > - * > - * The caller's @prep_sqe() function is invoked to fill in the details of the > - * sqe. Do not call io_uring_sqe_set_data() on this sqe. > - * > - * The sqe is submitted by the current AioContext. 
The kernel may see the sqe > - * as soon as @prep_sqe() returns or it may take until the next event loop > - * iteration. > - * > - * When the AioContext is destroyed, pending sqes are ignored and their > - * CqeHandlers are not invoked. > - * > - * This function must be called only when aio_has_io_uring() returns true. > - */ > -void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), > - void *opaque, CqeHandler *cqe_handler); > -#endif /* CONFIG_LINUX_IO_URING */ > - > #endif > diff --git a/include/qemu/aio.h b/include/qemu/aio.h > new file mode 100644 > index 00000000000..8cca2360d1a > --- /dev/null > +++ b/include/qemu/aio.h > @@ -0,0 +1,852 @@ > +/* > + * QEMU aio implementation > + * > + * Copyright IBM, Corp. 2008 > + * > + * Authors: > + * Anthony Liguori <[email protected]> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + */ > + > +#ifndef QEMU_AIO_H > +#define QEMU_AIO_H > + > +#ifdef CONFIG_LINUX_IO_URING > +#include <liburing.h> > +#endif > +#include "qemu/coroutine-core.h" > +#include "qemu/queue.h" > +#include "qemu/event_notifier.h" > +#include "qemu/lockcnt.h" > +#include "qemu/thread.h" > +#include "qemu/timer.h" > + > +struct MemReentrancyGuard; > + > +typedef struct AioHandler AioHandler; > +typedef QLIST_HEAD(, AioHandler) AioHandlerList; > +typedef void QEMUBHFunc(void *opaque); > +typedef bool AioPollFn(void *opaque); > +typedef void IOHandler(void *opaque); > + > +struct ThreadPoolAio; > +struct LinuxAioState; > +typedef struct LuringState LuringState; > + > +/* Is polling disabled? */ > +bool aio_poll_disabled(AioContext *ctx); > + > +#ifdef CONFIG_LINUX_IO_URING > +/* > + * Each io_uring request must have a unique CqeHandler that processes the > cqe. > + * The lifetime of a CqeHandler must be at least from aio_add_sqe() until > + * ->cb() invocation. > + */ > +typedef struct CqeHandler CqeHandler; > +struct CqeHandler { > + /* Called by the AioContext when the request has completed */ > + void (*cb)(CqeHandler *handler); > + > + /* Used internally, do not access this */ > + QSIMPLEQ_ENTRY(CqeHandler) next; > + > + /* This field is filled in before ->cb() is called */ > + struct io_uring_cqe cqe; > +}; > + > +typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ; > +#endif /* CONFIG_LINUX_IO_URING */ > + > +/* Callbacks for file descriptor monitoring implementations */ > +typedef struct { > + /* > + * update: > + * @ctx: the AioContext > + * @old_node: the existing handler or NULL if this file descriptor is > being > + * monitored for the first time > + * @new_node: the new handler or NULL if this file descriptor is being > + * removed > + * > + * Add/remove/modify a monitored file descriptor. > + * > + * Called with ctx->list_lock acquired. > + */ > + void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler > *new_node); > + > + /* > + * wait: > + * @ctx: the AioContext > + * @ready_list: list for handlers that become ready > + * @timeout: maximum duration to wait, in nanoseconds > + * > + * Wait for file descriptors to become ready and place them on > ready_list. > + * > + * Called with ctx->list_lock incremented but not locked. > + * > + * Returns: number of ready file descriptors. > + */ > + int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t > timeout); > + > + /* > + * need_wait: > + * @ctx: the AioContext > + * > + * Tell aio_poll() when to stop userspace polling early because ->wait() > + * has fds ready. 
> + * > + * File descriptor monitoring implementations that cannot poll fd > readiness > + * from userspace should use aio_poll_disabled() here. This ensures that > + * file descriptors are not starved by handlers that frequently make > + * progress via userspace polling. > + * > + * Returns: true if ->wait() should be called, false otherwise. > + */ > + bool (*need_wait)(AioContext *ctx); > + > + /* > + * dispatch: > + * @ctx: the AioContext > + * > + * Dispatch any work that is specific to this file descriptor monitoring > + * implementation. Usually the event loop's generic file descriptor > + * monitoring, BH, and timer dispatching code is sufficient, but file > + * descriptor monitoring implementations offering additional > functionality > + * may need to implement this function for custom behavior. Called at a > + * point in the event loop when it is safe to invoke user-defined > + * callbacks. > + * > + * This function is optional and may be NULL. > + * > + * Returns: true if progress was made (see aio_poll()'s return value), > + * false otherwise. > + */ > + bool (*dispatch)(AioContext *ctx); > + > + /* > + * gsource_prepare: > + * @ctx: the AioContext > + * > + * Prepare for the glib event loop to wait for events instead of the > usual > + * ->wait() call. See glib's GSourceFuncs->prepare(). > + */ > + void (*gsource_prepare)(AioContext *ctx); > + > + /* > + * gsource_check: > + * @ctx: the AioContext > + * > + * Called by the glib event loop from glib's GSourceFuncs->check() after > + * waiting for events. > + * > + * Returns: true when ready to be dispatched. > + */ > + bool (*gsource_check)(AioContext *ctx); > + > + /* > + * gsource_dispatch: > + * @ctx: the AioContext > + * @ready_list: list for handlers that become ready > + * > + * Place ready AioHandlers on ready_list. Called as part of the glib > event > + * loop from glib's GSourceFuncs->dispatch(). > + * > + * Called with list_lock incremented. > + */ > + void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list); > + > +#ifdef CONFIG_LINUX_IO_URING > + /** > + * add_sqe: Add an io_uring sqe for submission. > + * @prep_sqe: invoked with an sqe that should be prepared for submission > + * @opaque: user-defined argument to @prep_sqe() > + * @cqe_handler: the unique cqe handler associated with this request > + * > + * The caller's @prep_sqe() function is invoked to fill in the details of > + * the sqe. Do not call io_uring_sqe_set_data() on this sqe. > + * > + * The kernel may see the sqe as soon as @prep_sqe() returns or it may > take > + * until the next event loop iteration. > + * > + * This function is called from the current AioContext and is not > + * thread-safe. > + */ > + void (*add_sqe)(AioContext *ctx, > + void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), > + void *opaque, CqeHandler *cqe_handler); > +#endif /* CONFIG_LINUX_IO_URING */ > +} FDMonOps; > + > +/* > + * Each aio_bh_poll() call carves off a slice of the BH list, so that newly > + * scheduled BHs are not processed until the next aio_bh_poll() call. All > + * active aio_bh_poll() calls chain their slices together in a list, so that > + * nested aio_bh_poll() calls process all scheduled bottom halves. 
> + */ > +typedef QSLIST_HEAD(, QEMUBH) BHList; > +typedef struct BHListSlice BHListSlice; > +struct BHListSlice { > + BHList bh_list; > + QSIMPLEQ_ENTRY(BHListSlice) next; > +}; > + > +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; > + > +typedef struct AioPolledEvent { > + int64_t ns; /* current polling time in nanoseconds */ > +} AioPolledEvent; > + > +struct AioContext { > + GSource source; > + > + /* Used by AioContext users to protect from multi-threaded access. */ > + QemuRecMutex lock; > + > + /* > + * Keep track of readers and writers of the block layer graph. > + * This is essential to avoid performing additions and removal > + * of nodes and edges from block graph while some > + * other thread is traversing it. > + */ > + struct BdrvGraphRWlock *bdrv_graph; > + > + /* The list of registered AIO handlers. Protected by ctx->list_lock. */ > + AioHandlerList aio_handlers; > + > + /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. > */ > + AioHandlerList deleted_aio_handlers; > + > + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; > + * only written from the AioContext home thread, or under the BQL in > + * the case of the main AioContext. However, it is read from any > + * thread so it is still accessed with atomic primitives. > + * > + * If this field is 0, everything (file descriptors, bottom halves, > + * timers) will be re-evaluated before the next blocking poll() or > + * io_uring wait; therefore, the event_notifier_set call can be > + * skipped. If it is non-zero, you may need to wake up a concurrent > + * aio_poll or the glib main event loop, making event_notifier_set > + * necessary. > + * > + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 > + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. > + * Bits 1-31 simply count the number of active calls to aio_poll > + * that are in the prepare or poll phase. > + * > + * The GSource and aio_poll must use a different mechanism because > + * there is no certainty that a call to GSource's prepare callback > + * (via g_main_context_prepare) is indeed followed by check and > + * dispatch. It's not clear whether this would be a bug, but let's > + * play safe and allow it---it will just cause extra calls to > + * event_notifier_set until the next call to dispatch. > + * > + * Instead, the aio_poll calls include both the prepare and the > + * dispatch phase, hence a simple counter is enough for them. > + */ > + uint32_t notify_me; > + > + /* A lock to protect between QEMUBH and AioHandler adders and deleter, > + * and to ensure that no callbacks are removed while we're walking and > + * dispatching them. > + */ > + QemuLockCnt list_lock; > + > + /* Bottom Halves pending aio_bh_poll() processing */ > + BHList bh_list; > + > + /* Chained BH list slices for each nested aio_bh_poll() call */ > + QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list; > + > + /* Used by aio_notify. > + * > + * "notified" is used to avoid expensive event_notifier_test_and_clear > + * calls. When it is clear, the EventNotifier is clear, or one thread > + * is going to clear "notified" before processing more events. False > + * positives are possible, i.e. "notified" could be set even though the > + * EventNotifier is clear. > + * > + * Note that event_notifier_set *cannot* be optimized the same way. For > + * more information on the problem that would result, see "#ifdef BUG2" > + * in the docs/aio_notify_accept.promela formal model. 
> + */ > + bool notified; > + EventNotifier notifier; > + > + QSLIST_HEAD(, Coroutine) scheduled_coroutines; > + QEMUBH *co_schedule_bh; > + > + int thread_pool_min; > + int thread_pool_max; > + /* Thread pool for performing work and receiving completion callbacks. > + * Has its own locking. > + */ > + struct ThreadPoolAio *thread_pool; > + > +#ifdef CONFIG_LINUX_AIO > + struct LinuxAioState *linux_aio; > +#endif > +#ifdef CONFIG_LINUX_IO_URING > + /* State for file descriptor monitoring using Linux io_uring */ > + struct io_uring fdmon_io_uring; > + AioHandlerSList submit_list; > + void *io_uring_fd_tag; > + > + /* Pending callback state for cqe handlers */ > + CqeHandlerSimpleQ cqe_handler_ready_list; > +#endif /* CONFIG_LINUX_IO_URING */ > + > + /* TimerLists for calling timers - one per clock type. Has its own > + * locking. > + */ > + QEMUTimerListGroup tlg; > + > + /* Number of AioHandlers without .io_poll() */ > + int poll_disable_cnt; > + > + /* Polling mode parameters */ > + int64_t poll_max_ns; /* maximum polling time in nanoseconds */ > + int64_t poll_grow; /* polling time growth factor */ > + int64_t poll_shrink; /* polling time shrink factor */ > + > + /* AIO engine parameters */ > + int64_t aio_max_batch; /* maximum number of requests in a batch */ > + > + /* > + * List of handlers participating in userspace polling. Protected by > + * ctx->list_lock. Iterated and modified mostly by the event loop thread > + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() > + * only touches the list to delete nodes if ctx->list_lock's count is > zero. > + */ > + AioHandlerList poll_aio_handlers; > + > + /* Are we in polling mode or monitoring file descriptors? */ > + bool poll_started; > + > + /* epoll(7) state used when built with CONFIG_EPOLL */ > + int epollfd; > + > + /* The GSource unix fd tag for epollfd */ > + void *epollfd_tag; > + > + const FDMonOps *fdmon_ops; > + > + /* Was aio_context_new() successful? */ > + bool initialized; > +}; > + > +/** > + * aio_context_new: Allocate a new AioContext. > + * > + * AioContext provide a mini event-loop that can be waited on synchronously. > + * They also provide bottom halves, a service to execute a piece of code > + * as soon as possible. > + */ > +AioContext *aio_context_new(Error **errp); > + > +/** > + * aio_context_ref: > + * @ctx: The AioContext to operate on. > + * > + * Add a reference to an AioContext. > + */ > +void aio_context_ref(AioContext *ctx); > + > +/** > + * aio_context_unref: > + * @ctx: The AioContext to operate on. > + * > + * Drop a reference to an AioContext. > + */ > +void aio_context_unref(AioContext *ctx); > + > +/** > + * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that > will > + * run only once and as soon as possible. > + * > + * @name: A human-readable identifier for debugging purposes. > + */ > +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void > *opaque, > + const char *name); > + > +/** > + * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will > run > + * only once and as soon as possible. > + * > + * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as > the > + * name string. > + */ > +#define aio_bh_schedule_oneshot(ctx, cb, opaque) \ > + aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb))) > + > +/** > + * aio_bh_new_full: Allocate a new bottom half structure. 
> + * > + * Bottom halves are lightweight callbacks whose invocation is guaranteed > + * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure > + * is opaque and must be allocated prior to its use. > + * > + * @name: A human-readable identifier for debugging purposes. > + * @reentrancy_guard: A guard set when entering a cb to prevent > + * device-reentrancy issues > + */ > +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, > + const char *name, struct MemReentrancyGuard > *reentrancy_guard); > + > +/** > + * aio_bh_new: Allocate a new bottom half structure > + * > + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name > + * string. > + */ > +#define aio_bh_new(ctx, cb, opaque) \ > + aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL) > + > +/** > + * aio_bh_new_guarded: Allocate a new bottom half structure with a > + * reentrancy_guard > + * > + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name > + * string. > + */ > +#define aio_bh_new_guarded(ctx, cb, opaque, guard) \ > + aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard) > + > +/** > + * aio_notify: Force processing of pending events. > + * > + * Similar to signaling a condition variable, aio_notify forces > + * aio_poll to exit, so that the next call will re-examine pending events. > + * The caller of aio_notify will usually call aio_poll again very soon, > + * or go through another iteration of the GLib main loop. Hence, aio_notify > + * also has the side effect of recalculating the sets of file descriptors > + * that the main loop waits for. > + * > + * Calling aio_notify is rarely necessary, because for example scheduling > + * a bottom half calls it already. > + */ > +void aio_notify(AioContext *ctx); > + > +/** > + * aio_notify_accept: Acknowledge receiving an aio_notify. > + * > + * aio_notify() uses an EventNotifier in order to wake up a sleeping > + * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are > + * usually rare, but the AioContext has to clear the EventNotifier on > + * every aio_poll() or g_main_context_iteration() in order to avoid > + * busy waiting. This event_notifier_test_and_clear() cannot be done > + * using the usual aio_context_set_event_notifier(), because it must > + * be done before processing all events (file descriptors, bottom halves, > + * timers). > + * > + * aio_notify_accept() is an optimized event_notifier_test_and_clear() > + * that is specific to an AioContext's notifier; it is used internally > + * to clear the EventNotifier only if aio_notify() had been called. > + */ > +void aio_notify_accept(AioContext *ctx); > + > +/** > + * aio_bh_call: Executes callback function of the specified BH. > + */ > +void aio_bh_call(QEMUBH *bh); > + > +/** > + * aio_bh_poll: Poll bottom halves for an AioContext. > + * > + * These are internal functions used by the QEMU main loop. > + * And notice that multiple occurrences of aio_bh_poll cannot > + * be called concurrently > + */ > +int aio_bh_poll(AioContext *ctx); > + > +/** > + * qemu_bh_schedule: Schedule a bottom half. > + * > + * Scheduling a bottom half interrupts the main loop and causes the > + * execution of the callback that was passed to qemu_bh_new. > + * > + * Bottom halves that are scheduled from a bottom half handler are instantly > + * invoked. This can create an infinite loop if a bottom half handler > + * schedules itself. > + * > + * @bh: The bottom half to be scheduled. 
> + */ > +void qemu_bh_schedule(QEMUBH *bh); > + > +/** > + * qemu_bh_cancel: Cancel execution of a bottom half. > + * > + * Canceling execution of a bottom half undoes the effect of calls to > + * qemu_bh_schedule without freeing its resources yet. While cancellation > + * itself is also wait-free and thread-safe, it can of course race with the > + * loop that executes bottom halves unless you are holding the iothread > + * mutex. This makes it mostly useless if you are not holding the mutex. > + * > + * @bh: The bottom half to be canceled. > + */ > +void qemu_bh_cancel(QEMUBH *bh); > + > +/** > + *qemu_bh_delete: Cancel execution of a bottom half and free its resources. > + * > + * Deleting a bottom half frees the memory that was allocated for it by > + * qemu_bh_new. It also implies canceling the bottom half if it was > + * scheduled. > + * This func is async. The bottom half will do the delete action at the > finial > + * end. > + * > + * @bh: The bottom half to be deleted. > + */ > +void qemu_bh_delete(QEMUBH *bh); > + > +/* Return whether there are any pending callbacks from the GSource > + * attached to the AioContext, before g_poll is invoked. > + * > + * This is used internally in the implementation of the GSource. > + */ > +bool aio_prepare(AioContext *ctx); > + > +/* Return whether there are any pending callbacks from the GSource > + * attached to the AioContext, after g_poll is invoked. > + * > + * This is used internally in the implementation of the GSource. > + */ > +bool aio_pending(AioContext *ctx); > + > +/* Dispatch any pending callbacks from the GSource attached to the > AioContext. > + * > + * This is used internally in the implementation of the GSource. > + */ > +void aio_dispatch(AioContext *ctx); > + > +/* Progress in completing AIO work to occur. This can issue new pending > + * aio as a result of executing I/O completion or bh callbacks. > + * > + * Return whether any progress was made by executing AIO or bottom half > + * handlers. If @blocking == true, this should always be true except > + * if someone called aio_notify. > + * > + * If there are no pending bottom halves, but there are pending AIO > + * operations, it may not be possible to make any progress without > + * blocking. If @blocking is true, this function will wait until one > + * or more AIO events have completed, to ensure something has moved > + * before returning. > + */ > +bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking); > + > +/* Register a file descriptor and associated callbacks. Behaves very > similarly > + * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will > + * be invoked when using aio_poll(). > + * > + * Code that invokes AIO completion functions should rely on this function > + * instead of qemu_set_fd_handler[2]. > + */ > +void aio_set_fd_handler(AioContext *ctx, > + int fd, > + IOHandler *io_read, > + IOHandler *io_write, > + AioPollFn *io_poll, > + IOHandler *io_poll_ready, > + void *opaque); > + > +/* Register an event notifier and associated callbacks. Behaves very > similarly > + * to event_notifier_set_handler. Unlike event_notifier_set_handler, these > callbacks > + * will be invoked when using aio_poll(). > + * > + * Code that invokes AIO completion functions should rely on this function > + * instead of event_notifier_set_handler. 
> + */ > +void aio_set_event_notifier(AioContext *ctx, > + EventNotifier *notifier, > + EventNotifierHandler *io_read, > + AioPollFn *io_poll, > + EventNotifierHandler *io_poll_ready); > + > +/* > + * Set polling begin/end callbacks for an event notifier that has already > been > + * registered with aio_set_event_notifier. Do nothing if the event notifier > is > + * not registered. > + * > + * Note that if the io_poll_end() callback (or the entire notifier) is > removed > + * during polling, it will not be called, so an io_poll_begin() is not > + * necessarily always followed by an io_poll_end(). > + */ > +void aio_set_event_notifier_poll(AioContext *ctx, > + EventNotifier *notifier, > + EventNotifierHandler *io_poll_begin, > + EventNotifierHandler *io_poll_end); > + > +/* Return a GSource that lets the main loop poll the file descriptors > attached > + * to this AioContext. > + */ > +GSource *aio_get_g_source(AioContext *ctx); > + > +/* Return the ThreadPoolAio bound to this AioContext */ > +struct ThreadPoolAio *aio_get_thread_pool(AioContext *ctx); > + > +/* Setup the LinuxAioState bound to this AioContext */ > +struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); > + > +/* Return the LinuxAioState bound to this AioContext */ > +struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); > + > +/** > + * aio_timer_new_with_attrs: > + * @ctx: the aio context > + * @type: the clock type > + * @scale: the scale > + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values > + * to assign > + * @cb: the callback to call on timer expiry > + * @opaque: the opaque pointer to pass to the callback > + * > + * Allocate a new timer (with attributes) attached to the context @ctx. > + * The function is responsible for memory allocation. > + * > + * The preferred interface is aio_timer_init or aio_timer_init_with_attrs. > + * Use that unless you really need dynamic memory allocation. > + * > + * Returns: a pointer to the new timer > + */ > +static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx, > + QEMUClockType type, > + int scale, int attributes, > + QEMUTimerCB *cb, void > *opaque) > +{ > + return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque); > +} > + > +/** > + * aio_timer_new: > + * @ctx: the aio context > + * @type: the clock type > + * @scale: the scale > + * @cb: the callback to call on timer expiry > + * @opaque: the opaque pointer to pass to the callback > + * > + * Allocate a new timer attached to the context @ctx. > + * See aio_timer_new_with_attrs for details. > + * > + * Returns: a pointer to the new timer > + */ > +static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type, > + int scale, > + QEMUTimerCB *cb, void *opaque) > +{ > + return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque); > +} > + > +/** > + * aio_timer_init_with_attrs: > + * @ctx: the aio context > + * @ts: the timer > + * @type: the clock type > + * @scale: the scale > + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values > + * to assign > + * @cb: the callback to call on timer expiry > + * @opaque: the opaque pointer to pass to the callback > + * > + * Initialise a new timer (with attributes) attached to the context @ctx. > + * The caller is responsible for memory allocation. 
> + */ > +static inline void aio_timer_init_with_attrs(AioContext *ctx, > + QEMUTimer *ts, QEMUClockType > type, > + int scale, int attributes, > + QEMUTimerCB *cb, void *opaque) > +{ > + timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque); > +} > + > +/** > + * aio_timer_init: > + * @ctx: the aio context > + * @ts: the timer > + * @type: the clock type > + * @scale: the scale > + * @cb: the callback to call on timer expiry > + * @opaque: the opaque pointer to pass to the callback > + * > + * Initialise a new timer attached to the context @ctx. > + * See aio_timer_init_with_attrs for details. > + */ > +static inline void aio_timer_init(AioContext *ctx, > + QEMUTimer *ts, QEMUClockType type, > + int scale, > + QEMUTimerCB *cb, void *opaque) > +{ > + timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque); > +} > + > +/** > + * aio_compute_timeout: > + * @ctx: the aio context > + * > + * Compute the timeout that a blocking aio_poll should use. > + */ > +int64_t aio_compute_timeout(AioContext *ctx); > + > +/** > + * aio_co_schedule: > + * @ctx: the aio context > + * @co: the coroutine > + * > + * Start a coroutine on a remote AioContext. > + * > + * The coroutine must not be entered by anyone else while aio_co_schedule() > + * is active. In addition the coroutine must have yielded unless ctx > + * is the context in which the coroutine is running (i.e. the value of > + * qemu_get_current_aio_context() from the coroutine itself). > + */ > +void aio_co_schedule(AioContext *ctx, Coroutine *co); > + > +/** > + * aio_co_reschedule_self: > + * @new_ctx: the new context > + * > + * Move the currently running coroutine to new_ctx. If the coroutine is > already > + * running in new_ctx, do nothing. > + * > + * Note that this function cannot reschedule from iohandler_ctx to > + * qemu_aio_context. > + */ > +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); > + > +/** > + * aio_co_wake: > + * @co: the coroutine > + * > + * Restart a coroutine on the AioContext where it was running last, thus > + * preventing coroutines from jumping from one context to another when they > + * go to sleep. > + * > + * aio_co_wake may be executed either in coroutine or non-coroutine > + * context. The coroutine must not be entered by anyone else while > + * aio_co_wake() is active. > + * > + * If `co`'s AioContext differs from the current AioContext, this will call > + * aio_co_schedule(), which makes this safe to use even when `co` has not > + * yielded yet. In such a case, it will be entered once it yields. > + * > + * In contrast, if `co`'s AioContext is equal to the current one, it is > + * required for `co` to currently be yielding. This is generally the case > + * if the caller is not in `co` (i.e. invoked by `co`), because the only > + * other way for the caller to be running then is for `co` to currently be > + * yielding. > + * > + * Therefore, if there is no way for the caller to be invoked/entered by > + * `co`, it is generally safe to call this regardless of whether `co` is > + * known to already be yielding or not -- it only has to yield at some > + * point. > + */ > +void aio_co_wake(Coroutine *co); > + > +/** > + * aio_co_enter: > + * @ctx: the context to run the coroutine > + * @co: the coroutine to run > + * > + * Enter a coroutine in the specified AioContext. > + */ > +void aio_co_enter(AioContext *ctx, Coroutine *co); > + > +/** > + * Return the AioContext whose event loop runs in the current thread. 
> + * > + * If called from an IOThread this will be the IOThread's AioContext. If > + * called from the main thread or with the "big QEMU lock" taken it > + * will be the main loop AioContext. > + * > + * Note that the return value is never the main loop's iohandler_ctx and the > + * return value is the main loop AioContext instead. > + */ > +AioContext *qemu_get_current_aio_context(void); > + > +void qemu_set_current_aio_context(AioContext *ctx); > + > +/** > + * aio_context_setup: > + * @ctx: the aio context > + * @errp: error pointer > + * > + * Initialize the aio context. > + * > + * Returns: true on success, false otherwise > + */ > +bool aio_context_setup(AioContext *ctx, Error **errp); > + > +/** > + * aio_context_destroy: > + * @ctx: the aio context > + * > + * Destroy the aio context. > + */ > +void aio_context_destroy(AioContext *ctx); > + > +/** > + * aio_context_set_poll_params: > + * @ctx: the aio context > + * @max_ns: how long to busy poll for, in nanoseconds > + * @grow: polling time growth factor > + * @shrink: polling time shrink factor > + * > + * Poll mode can be disabled by setting poll_max_ns to 0. > + */ > +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, > + int64_t grow, int64_t shrink, > + Error **errp); > + > +/** > + * aio_context_set_aio_params: > + * @ctx: the aio context > + * @max_batch: maximum number of requests in a batch, 0 means that the > + * engine will use its default > + */ > +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch); > + > +/** > + * aio_context_set_thread_pool_params: > + * @ctx: the aio context > + * @min: min number of threads to have readily available in the thread pool > + * @min: max number of threads the thread pool can contain > + */ > +void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min, > + int64_t max, Error **errp); > + > +#ifdef CONFIG_LINUX_IO_URING > +/** > + * aio_has_io_uring: Return whether io_uring is available. > + * > + * io_uring is either available in all AioContexts or in none, so this only > + * needs to be called once from within any thread's AioContext. > + */ > +static inline bool aio_has_io_uring(void) > +{ > + AioContext *ctx = qemu_get_current_aio_context(); > + return ctx->fdmon_ops->add_sqe; > +} > + > +/** > + * aio_add_sqe: Add an io_uring sqe for submission. > + * @prep_sqe: invoked with an sqe that should be prepared for submission > + * @opaque: user-defined argument to @prep_sqe() > + * @cqe_handler: the unique cqe handler associated with this request > + * > + * The caller's @prep_sqe() function is invoked to fill in the details of the > + * sqe. Do not call io_uring_sqe_set_data() on this sqe. > + * > + * The sqe is submitted by the current AioContext. The kernel may see the sqe > + * as soon as @prep_sqe() returns or it may take until the next event loop > + * iteration. > + * > + * When the AioContext is destroyed, pending sqes are ignored and their > + * CqeHandlers are not invoked. > + * > + * This function must be called only when aio_has_io_uring() returns true. 
> + */ > +void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque), > + void *opaque, CqeHandler *cqe_handler); > +#endif /* CONFIG_LINUX_IO_URING */ > + > +#endif > diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h > index 0d55c636b21..8c1241a2c11 100644 > --- a/include/qemu/main-loop.h > +++ b/include/qemu/main-loop.h > @@ -25,7 +25,7 @@ > #ifndef QEMU_MAIN_LOOP_H > #define QEMU_MAIN_LOOP_H > > -#include "block/aio.h" > +#include "qemu/aio.h" > #include "qom/object.h" > #include "system/event-loop-base.h" > > @@ -431,7 +431,7 @@ void qemu_cond_timedwait_bql(QemuCond *cond, int ms); > #define qemu_bh_new(cb, opaque) \ > qemu_bh_new_full((cb), (opaque), (stringify(cb)), NULL) > QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name, > - MemReentrancyGuard *reentrancy_guard); > + struct MemReentrancyGuard *reentrancy_guard); > void qemu_bh_schedule_idle(QEMUBH *bh); > > enum { > --
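To make the split concrete for readers following along: after this patch,
event-loop-only code can include qemu/aio.h directly, without pulling in
block-layer types such as BlockAIOCB. Below is a rough sketch (not code from
the patch; bump_counter_bh() and start_counting() are made-up names for
illustration) that uses only declarations visible in qemu/aio.h above:

#include "qemu/osdep.h"
#include "qemu/aio.h"    /* AioContext, QEMUBH, timers, fd handlers */

/* BH callback; runs in the AioContext's home thread. */
static void bump_counter_bh(void *opaque)
{
    int *counter = opaque;
    (*counter)++;
}

static QEMUBH *start_counting(AioContext *ctx, int *counter)
{
    /* One-shot BH: allocated internally, runs once as soon as possible. */
    aio_bh_schedule_oneshot(ctx, bump_counter_bh, counter);

    /* Persistent BH: the caller owns it and schedules it as needed. */
    QEMUBH *bh = aio_bh_new(ctx, bump_counter_bh, counter);
    qemu_bh_schedule(bh);    /* wait-free, thread-safe, signal-safe */
    return bh;               /* later: qemu_bh_delete(bh) cancels and frees */
}

Nothing here needs block/aio.h any more, which is exactly the boundary the
patch draws.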
Should block/aio.h include qemu/aio.h? Users could instead include both
headers where needed (sketched in the P.S. below). Otherwise this looks okay.

Reviewed-by: Prasad Pandit <[email protected]>

Thank you.
---
  - Prasad
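P.S. To sketch the alternative I'm suggesting (hypothetical, not what this
patch does): if block/aio.h did not include qemu/aio.h, block-layer users
would simply name both headers themselves, e.g.:

#include "qemu/aio.h"    /* AioContext, bottom halves, timers */
#include "block/aio.h"   /* BlockAIOCB, BlockCompletionFunc, qemu_aio_get() */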
