When a termination signal is received, qemu-storage-daemon stops the
export, exits the main loop (main_loop_wait), and begins resource
cleanup. However, some FUSE_IO_URING_CMD_COMMIT_AND_FETCH SQEs may
remain pending in the kernel, waiting for incoming FUSE requests.
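(Illustration only, not part of the patch: such a pending submission is
queued roughly as sketched below, using liburing and the Linux
FUSE-over-io_uring uapi. The helper name is hypothetical, and real code
must additionally fill the uring_cmd payload, struct fuse_uring_cmd_req,
on a ring created with SQE128/CQE32 support.)

    #include <errno.h>
    #include <liburing.h>
    #include <linux/fuse.h>    /* FUSE_IO_URING_CMD_COMMIT_AND_FETCH */

    /* Hypothetical sketch: queue one COMMIT_AND_FETCH uring_cmd SQE. */
    static int queue_commit_and_fetch(struct io_uring *ring, int fuse_fd,
                                      void *ent)
    {
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe) {
            return -EAGAIN;
        }

        io_uring_prep_rw(IORING_OP_URING_CMD, sqe, fuse_fd, NULL, 0, 0);
        sqe->cmd_op = FUSE_IO_URING_CMD_COMMIT_AND_FETCH;
        io_uring_sqe_set_data(sqe, ent);   /* recovered from the CQE later */

        /*
         * Once submitted, the SQE stays pending in the kernel until a
         * FUSE request arrives; the kernel offers no asynchronous way
         * to cancel it.
         */
        return io_uring_submit(ring);
    }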
Currently, there is no way to manually cancel these pending SQEs in
the kernel. As a result, after export termination, the related data
structures might be deleted before the corresponding CQEs return,
causing the CQE handler to run on freed state, which may lead to a
segfault. As a workaround, when submitting an SQE to the kernel, we
take a reference to the block export (blk_exp_ref) so that it cannot
be deleted while completions are still outstanding. Once the CQE is
received, we drop the reference (blk_exp_unref).

However, this introduces a new issue: if no new FUSE requests arrive,
the pending SQEs held by the kernel never complete. Consequently, the
export reference count never drops to zero, preventing the export from
shutting down cleanly.

To resolve this, we schedule a Bottom Half (BH) for each FUSE queue
during the export shutdown phase. The BH closes the queue's fuse_fd in
its IOThread, avoiding races on the file descriptor, while the session
is unmounted during the remainder of the shutdown sequence. Unmounting
the session aborts the FUSE connection, which cancels all pending SQEs
in the kernel and forces the corresponding CQEs to return. Those
completions drop the held references, allowing the export to be freed
safely.

Suggested-by: Kevin Wolf <[email protected]>
Suggested-by: Stefan Hajnoczi <[email protected]>
Signed-off-by: Brian Song <[email protected]>
---
 block/export/fuse.c | 100 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 90 insertions(+), 10 deletions(-)
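(Illustration only, not part of the patch: the reference counting
described in the commit message pairs roughly as in the sketch below.
blk_exp_ref()/blk_exp_unref() are the existing block-export refcount
helpers; the function names and the exact completion semantics here are
assumptions for illustration.)

    /* Sketch: pin the export for the lifetime of one submitted SQE. */
    static void fuse_uring_submit_one(FuseExport *exp)
    {
        blk_exp_ref(&exp->common);    /* dropped again in the CQE handler */
        /* ... prepare and submit the COMMIT_AND_FETCH SQE ... */
    }

    /* Sketch: on completion, process the CQE, then unpin the export. */
    static void fuse_uring_cqe_handler(FuseExport *exp, int cqe_res)
    {
        /*
         * cqe_res carries either a FUSE request to service or an error
         * (e.g. once fuse_session_unmount() has aborted the connection).
         */
        blk_exp_unref(&exp->common);  /* may now free the export safely */
    }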
diff --git a/block/export/fuse.c b/block/export/fuse.c
index c117e081cd..abae83041b 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -934,6 +934,57 @@ static void read_from_fuse_fd(void *opaque)
     qemu_coroutine_enter(co);
 }
 
+#ifdef CONFIG_LINUX_IO_URING
+static void fuse_export_delete_uring(FuseExport *exp)
+{
+    exp->is_uring = false;
+    exp->uring_started = false;
+
+    for (int i = 0; i < exp->num_uring_queues; i++) {
+        FuseUringQueue *rq = &exp->uring_queues[i];
+
+        for (int j = 0; j < FUSE_DEFAULT_URING_QUEUE_DEPTH; j++) {
+            g_free(rq->ent[j].req_payload);
+        }
+        g_free(rq->ent);
+    }
+
+    g_free(exp->uring_queues);
+}
+#endif
+
+/**
+ * The Linux kernel currently lacks support for asynchronous cancellation
+ * of FUSE-over-io_uring SQEs. This can lead to a race where an IOThread may
+ * access fuse_fd after it is closed but before pending SQEs are canceled,
+ * potentially operating on a newly reused file descriptor.
+ *
+ * Therefore, schedule a BH in the IOThread to close and invalidate fuse_fd,
+ * avoiding races on the file descriptor.
+ */
+#ifdef CONFIG_LINUX_IO_URING
+static void close_fuse_fd(void *opaque)
+{
+    FuseQueue *q = opaque;
+
+    if (q->fuse_fd >= 0) {
+        close(q->fuse_fd);
+        q->fuse_fd = -1;
+    }
+}
+#endif
+
+/**
+ * During exit in FUSE-over-io_uring mode, qemu-storage-daemon requests
+ * shutdown in main() and then immediately tears down the block export.
+ * However, SQEs already submitted under FUSE-over-io_uring may still complete
+ * and generate CQEs that continue to hold references to the block export,
+ * preventing it from being freed cleanly.
+ *
+ * Since the Linux kernel currently lacks support for asynchronous cancellation
+ * of FUSE-over-io_uring SQEs, this function aborts the connection and cancels
+ * all pending SQEs to ensure a safe teardown.
+ */
 static void fuse_export_shutdown(BlockExport *blk_exp)
 {
     FuseExport *exp = container_of(blk_exp, FuseExport, common);
@@ -949,18 +1000,42 @@ static void fuse_export_shutdown(BlockExport *blk_exp)
          */
         g_hash_table_remove(exports, exp->mountpoint);
     }
+
+#ifdef CONFIG_LINUX_IO_URING
+    if (exp->uring_started) {
+        for (size_t i = 0; i < exp->num_fuse_queues; i++) {
+            FuseQueue *q = &exp->queues[i];
+
+            /* Queue 0's FD belongs to the FUSE session */
+            if (i > 0) {
+                aio_bh_schedule_oneshot(q->ctx, close_fuse_fd, q);
+            }
+        }
+
+        /* To cancel all pending SQEs */
+        if (exp->fuse_session) {
+            if (exp->mounted) {
+                fuse_session_unmount(exp->fuse_session);
+            }
+            fuse_session_destroy(exp->fuse_session);
+        }
+        g_free(exp->mountpoint);
+    }
+#endif
 }
 
 static void fuse_export_delete(BlockExport *blk_exp)
 {
     FuseExport *exp = container_of(blk_exp, FuseExport, common);
 
-    for (int i = 0; i < exp->num_fuse_queues; i++) {
+    for (size_t i = 0; i < exp->num_fuse_queues; i++) {
         FuseQueue *q = &exp->queues[i];
 
-        /* Queue 0's FD belongs to the FUSE session */
-        if (i > 0 && q->fuse_fd >= 0) {
-            close(q->fuse_fd);
+        if (!exp->uring_started) {
+            /* Queue 0's FD belongs to the FUSE session */
+            if (i > 0 && q->fuse_fd >= 0) {
+                close(q->fuse_fd);
+            }
         }
         if (q->spillover_buf) {
             qemu_vfree(q->spillover_buf);
@@ -968,15 +1043,20 @@ static void fuse_export_delete(BlockExport *blk_exp)
     }
     g_free(exp->queues);
 
-    if (exp->fuse_session) {
-        if (exp->mounted) {
-            fuse_session_unmount(exp->fuse_session);
+    if (exp->uring_started) {
+#ifdef CONFIG_LINUX_IO_URING
+        fuse_export_delete_uring(exp);
+#endif
+    } else {
+        if (exp->fuse_session) {
+            if (exp->mounted) {
+                fuse_session_unmount(exp->fuse_session);
+            }
+            fuse_session_destroy(exp->fuse_session);
         }
-        fuse_session_destroy(exp->fuse_session);
+        g_free(exp->mountpoint);
     }
-
-    g_free(exp->mountpoint);
 }
 
 /**
-- 
2.43.0
