On 8/14/25 11:46 PM, Brian Song wrote:
From: Brian Song <hibrians...@gmail.com>

This patch adds a new export option for storage-export-daemon to enable
or disable FUSE-over-io_uring via the switch io-uring=on|off (disabled
by default). It also implements the protocol handshake with the Linux
kernel during the FUSE-over-io_uring initialization phase.

See: https://docs.kernel.org/filesystems/fuse-io-uring.html

The kernel documentation describes in detail how FUSE-over-io_uring
works. This patch implements the Initial SQE stage shown in the diagram:
it initializes one queue per IOThread, each currently supporting a
single submission queue entry (SQE). When the FUSE driver sends the
first FUSE request (FUSE_INIT), storage-export-daemon calls
fuse_uring_start() to complete initialization, ultimately submitting
the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
successful initialization with the kernel.
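
As a rough illustration of that flow (this is only a sketch, not the code in
this patch: the exp->num_queues / exp->queues fields and the use of
aio_bh_schedule_oneshot() are assumptions for readability), fuse_uring_start()
would walk the queues and have each one submit its REGISTER SQE from its own
AioContext:

    /* Illustrative sketch only -- num_queues/queues field names are assumed */
    static void fuse_uring_start(FuseExport *exp)
    {
        for (size_t i = 0; i < exp->num_queues; i++) {
            FuseQueue *q = &exp->queues[i];

            /*
             * The REGISTER SQE has to be submitted from the queue's own
             * AioContext (its IOThread), so defer via a one-shot BH.
             */
            aio_bh_schedule_oneshot(q->ctx, fuse_uring_submit_register, q);
        }
    }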

Suggested-by: Kevin Wolf <kw...@redhat.com>
Suggested-by: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: Brian Song <hibrians...@gmail.com>
---
  block/export/fuse.c                  | 161 ++++++++++++++++++++++++---
  docs/tools/qemu-storage-daemon.rst   |  11 +-
  qapi/block-export.json               |   5 +-
  storage-daemon/qemu-storage-daemon.c |   1 +
  util/fdmon-io_uring.c                |   5 +-
  5 files changed, 159 insertions(+), 24 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index c0ad4696ce..59fa79f486 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -48,6 +48,11 @@
  #include <linux/fs.h>
  #endif

+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
  /* Prevent overly long bounce buffer allocations */
  #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
  /*
@@ -63,12 +68,31 @@
      (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)

  typedef struct FuseExport FuseExport;
+typedef struct FuseQueue FuseQueue;
+
+typedef struct FuseRingEnt {
+    /* back pointer */
+    FuseQueue *q;
+
+    /* commit id of a fuse request */
+    uint64_t req_commit_id;
+
+    /* fuse request header and payload */
+    struct fuse_uring_req_header req_header;
+    void *op_payload;
+    size_t req_payload_sz;
+
+    /* The vector passed to the kernel */
+    struct iovec iov[2];
+
+    CqeHandler fuse_cqe_handler;
+} FuseRingEnt;

  /*
   * One FUSE "queue", representing one FUSE FD from which requests are fetched
   * and processed.  Each queue is tied to an AioContext.
   */
-typedef struct FuseQueue {
+struct FuseQueue {
      FuseExport *exp;

      AioContext *ctx;
@@ -109,7 +133,12 @@ typedef struct FuseQueue {
       * Free this buffer with qemu_vfree().
       */
      void *spillover_buf;
-} FuseQueue;
+
+#ifdef CONFIG_LINUX_IO_URING
+    int qid;
+    FuseRingEnt ent;
+#endif
+};

  /*
   * Verify that FuseQueue.request_buf plus the spill-over buffer together
@@ -148,6 +177,7 @@ struct FuseExport {
      bool growable;
      /* Whether allow_other was used as a mount option or not */
      bool allow_other;
+    bool is_uring;

      mode_t st_mode;
      uid_t st_uid;
@@ -257,6 +287,93 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
      .drained_poll  = fuse_export_drained_poll,
  };

+#ifdef CONFIG_LINUX_IO_URING
+
+static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
+                    const unsigned int qid,
+                    const unsigned int commit_id)
+{
+    req->qid = qid;
+    req->commit_id = commit_id;
+    req->flags = 0;
+}
+
+static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
+               __u32 cmd_op)
+{
+    sqe->opcode = IORING_OP_URING_CMD;
+
+    sqe->fd = q->fuse_fd;
+    sqe->rw_flags = 0;
+    sqe->ioprio = 0;
+    sqe->off = 0;
+
+    sqe->cmd_op = cmd_op;
+    sqe->__pad1 = 0;
+}
+
+static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
+{
+    FuseQueue *q = opaque;
+    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
+
+    fuse_uring_sqe_prepare(sqe, q, FUSE_IO_URING_CMD_REGISTER);
+
+    sqe->addr = (uint64_t)(q->ent.iov);
+    sqe->len = 2;
+
+    fuse_uring_sqe_set_req_data(req, q->qid, 0);
+}
+
+static void fuse_uring_submit_register(void *opaque)
+{
+    FuseQueue *q = opaque;
+    FuseExport *exp = q->exp;
+
+
+    aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));

I think there might be a tricky issue with the io_uring integration in QEMU. Currently, when the number of IOThreads goes above ~6 or 7, there is a fairly high chance of a hang. I added some debug logging in the kernel's fuse_uring_cmd() registration path and noticed that the number of REGISTER calls it receives is lower than the total number of ring entries across all queues, even though every entry of every queue should get registered.
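
Roughly speaking, the kernel-side check amounts to counting the REGISTER
commands as they arrive, something along these lines (purely illustrative,
not the exact debug code; the variable names are made up):

    /* hypothetical counter inside fs/fuse/dev_uring.c:fuse_uring_cmd() */
    static atomic_t reg_cnt = ATOMIC_INIT(0);
    ...
    case FUSE_IO_URING_CMD_REGISTER:
        pr_info("fuse-uring: REGISTER #%d qid=%u\n",
                atomic_inc_return(&reg_cnt), cmd_req->qid);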

On the userspace side everything looks normal: the number of aio_add_sqe() calls matches the number of IOThreads. But here is the weird part: if I add a printf inside the while loop in fdmon-io_uring.c::fdmon_io_uring_wait(), everything suddenly works fine and the kernel receives registration requests for all entries, as expected.

    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
        fprintf(stderr, "io_uring_submit_and_wait ret: %d\n", ret);
    } while (ret == -EINTR);

My guess is that the printf is simply slowing down the loop, or perhaps that the internal locking in fprintf() acts as an implicit memory barrier. Obviously, the right fix isn't to sprinkle fprintf() calls around; I suspect there is a subtle synchronization/race issue here.
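
One way to separate "just slower" from "missing ordering" might be to put an
explicit full barrier in the same spot instead of the fprintf() (again, only a
debugging experiment, not a proposed fix; smp_mb() comes from qemu/atomic.h):

    do {
        ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
        /* stand-in for whatever ordering/delay the fprintf() introduced */
        smp_mb();
    } while (ret == -EINTR);

If the barrier alone is enough for all REGISTER SQEs to reach the kernel, that
would point at a memory-ordering problem rather than purely a timing effect.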

Brian
