On 8/19/25 7:23 PM, Brian Song wrote:
>
>
> On 8/19/25 6:26 PM, Bernd Schubert wrote:
>>
>>
>> On 8/19/25 03:15, Brian Song wrote:
>>>
>>>
>>> On 8/18/25 7:04 PM, Bernd Schubert wrote:
>>>>
>>>>
>>>> On 8/17/25 01:13, Brian Song wrote:
>>>>>
>>>>>
>>>>> On 8/14/25 11:46 PM, Brian Song wrote:
>>>>>> From: Brian Song <hibrians...@gmail.com>
>>>>>>
>>>>>> This patch adds a new export option for storage-export-daemon to
>>>>>> enable
>>>>>> or disable FUSE-over-io_uring via the switch io-uring=on|off (disable
>>>>>> by default). It also implements the protocol handshake with the Linux
>>>>>> kernel during the FUSE-over-io_uring initialization phase.
>>>>>>
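>>>>>> [A rough usage sketch of the new option: node/export names here are
>>>>>> made up, only the io-uring=on spelling is taken from this patch, and
>>>>>> the rest of the FUSE export syntax is assumed from existing
>>>>>> qemu-storage-daemon usage:
>>>>>>
>>>>>>   $ qemu-storage-daemon \
>>>>>>       --blockdev driver=file,node-name=disk0,filename=disk.img \
>>>>>>       --export type=fuse,id=exp0,node-name=disk0,mountpoint=/mnt/disk0,io-uring=on
>>>>>> ]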
>>>>>> See: https://docs.kernel.org/filesystems/fuse-io-uring.html
>>>>>>
>>>>>> The kernel documentation describes in detail how FUSE-over-io_uring
>>>>>> works. This patch implements the Initial SQE stage shown in
>>>>>> thediagram:
>>>>>> it initializes one queue per IOThread, each currently supporting a
>>>>>> single submission queue entry (SQE). When the FUSE driver sends the
>>>>>> first FUSE request (FUSE_INIT), storage-export-daemon calls
>>>>>> fuse_uring_start() to complete initialization, ultimately submitting
>>>>>> the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
>>>>>> successful initialization with the kernel.
>>>>>>
>>>>>> Suggested-by: Kevin Wolf <kw...@redhat.com>
>>>>>> Suggested-by: Stefan Hajnoczi <stefa...@redhat.com>
>>>>>> Signed-off-by: Brian Song <hibrians...@gmail.com>
>>>>>> ---
>>>>>>     block/export/fuse.c                  | 161 ++++++++++++++++++++++---
>>>>>>     docs/tools/qemu-storage-daemon.rst   |  11 +-
>>>>>>     qapi/block-export.json               |   5 +-
>>>>>>     storage-daemon/qemu-storage-daemon.c |   1 +
>>>>>>     util/fdmon-io_uring.c                |   5 +-
>>>>>>     5 files changed, 159 insertions(+), 24 deletions(-)
>>>>>>
>>>>>> diff --git a/block/export/fuse.c b/block/export/fuse.c
>>>>>> index c0ad4696ce..59fa79f486 100644
>>>>>> --- a/block/export/fuse.c
>>>>>> +++ b/block/export/fuse.c
>>>>>> @@ -48,6 +48,11 @@
>>>>>>     #include <linux/fs.h>
>>>>>>     #endif
>>>>>>
>>>>>> +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
>>>>>> +
>>>>>> +/* room needed in buffer to accommodate header */
>>>>>> +#define FUSE_BUFFER_HEADER_SIZE 0x1000
>>>>>> +
>>>>>>     /* Prevent overly long bounce buffer allocations */
>>>>>>     #define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
>>>>>>     /*
>>>>>> @@ -63,12 +68,31 @@
>>>>>>         (FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
>>>>>>
>>>>>>     typedef struct FuseExport FuseExport;
>>>>>> +typedef struct FuseQueue FuseQueue;
>>>>>> +
>>>>>> +typedef struct FuseRingEnt {
>>>>>> +    /* back pointer */
>>>>>> +    FuseQueue *q;
>>>>>> +
>>>>>> +    /* commit id of a fuse request */
>>>>>> +    uint64_t req_commit_id;
>>>>>> +
>>>>>> +    /* fuse request header and payload */
>>>>>> +    struct fuse_uring_req_header req_header;
>>>>>> +    void *op_payload;
>>>>>> +    size_t req_payload_sz;
>>>>>> +
>>>>>> +    /* The vector passed to the kernel */
>>>>>> +    struct iovec iov[2];
>>>>>> +
>>>>>> +    CqeHandler fuse_cqe_handler;
>>>>>> +} FuseRingEnt;
>>>>>>
>>>>>>     /*
>>>>>>      * One FUSE "queue", representing one FUSE FD from which requests are fetched
>>>>>>      * and processed.  Each queue is tied to an AioContext.
>>>>>>      */
>>>>>> -typedef struct FuseQueue {
>>>>>> +struct FuseQueue {
>>>>>>         FuseExport *exp;
>>>>>>
>>>>>>         AioContext *ctx;
>>>>>> @@ -109,7 +133,12 @@ typedef struct FuseQueue {
>>>>>>          * Free this buffer with qemu_vfree().
>>>>>>          */
>>>>>>         void *spillover_buf;
>>>>>> -} FuseQueue;
>>>>>> +
>>>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>>>> +    int qid;
>>>>>> +    FuseRingEnt ent;
>>>>>> +#endif
>>>>>> +};
>>>>>>
>>>>>>     /*
>>>>>>      * Verify that FuseQueue.request_buf plus the spill-over buffer together
>>>>>> @@ -148,6 +177,7 @@ struct FuseExport {
>>>>>>         bool growable;
>>>>>>         /* Whether allow_other was used as a mount option or not */
>>>>>>         bool allow_other;
>>>>>> +    bool is_uring;
>>>>>>
>>>>>>         mode_t st_mode;
>>>>>>         uid_t st_uid;
>>>>>> @@ -257,6 +287,93 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
>>>>>>         .drained_poll  = fuse_export_drained_poll,
>>>>>>     };
>>>>>>
>>>>>> +#ifdef CONFIG_LINUX_IO_URING
>>>>>> +
>>>>>> +static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
>>>>>> +                    const unsigned int qid,
>>>>>> +                    const unsigned int commit_id)
>>>>>> +{
>>>>>> +    req->qid = qid;
>>>>>> +    req->commit_id = commit_id;
>>>>>> +    req->flags = 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
>>>>>> +               __u32 cmd_op)
>>>>>> +{
>>>>>> +    sqe->opcode = IORING_OP_URING_CMD;
>>>>>> +
>>>>>> +    sqe->fd = q->fuse_fd;
>>>>>> +    sqe->rw_flags = 0;
>>>>>> +    sqe->ioprio = 0;
>>>>>> +    sqe->off = 0;
>>>>>> +
>>>>>> +    sqe->cmd_op = cmd_op;
>>>>>> +    sqe->__pad1 = 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
>>>>>> +{
>>>>>> +    FuseQueue *q = opaque;
>>>>>> +    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
>>>>>> +
>>>>>> +    fuse_uring_sqe_prepare(sqe, q, FUSE_IO_URING_CMD_REGISTER);
>>>>>> +
>>>>>> +    sqe->addr = (uint64_t)(q->ent.iov);
>>>>>> +    sqe->len = 2;
>>>>>> +
>>>>>> +    fuse_uring_sqe_set_req_data(req, q->qid, 0);
>>>>>> +}
>>>>>> +
>>>>>> +static void fuse_uring_submit_register(void *opaque)
>>>>>> +{
>>>>>> +    FuseQueue *q = opaque;
>>>>>> +    FuseExport *exp = q->exp;
>>>>>> +
>>>>>> +
>>>>>> +    aio_add_sqe(fuse_uring_prep_sqe_register, q, &(q->ent.fuse_cqe_handler));
>>>>>
>>>>> I think there might be a tricky issue with the io_uring integration in
>>>>> QEMU. Currently, when the number of IOThreads goes above ~6 or 7,
>>>>> there’s a pretty high chance of a hang. I added some debug logging in
>>>>> the kernel’s fuse_uring_cmd() registration path, and noticed that the
>>>>> number of register calls is lower than the total number of entries
>>>>> across the queues. In theory, every entry of every queue should get
>>>>> registered.
>>>>
>>>> Did you also try to add logging at the top of fuse_uring_cmd()? I wonder
>>>> if there is a start-up race and if initial commands are just getting
>>>> refused. I had run into the issues you are describing in some versions of
>>>> the -rfc patches, but thought that everything had been fixed for that.
>>>> I.e., it's not excluded that there is still a kernel issue left.
>>>>
>>>> Thanks,
>>>> Bernd
>>>>
>>>>
>>>
>>> Yes. I added a printk at the beginning of fuse_uring_cmd(), another at
>>> the beginning of fuse_uring_register(), and one more at the end of
>>> fuse_uring_do_register(). Then I created and registered 20 queues, each
>>> with a single ring entry. It printed 37 times (the count differs on every
>>> run) with opcode FUSE_IO_URING_CMD_REGISTER (I would expect 20), and only
>>> 6 queues were registered successfully. The remaining 31 fuse_uring_cmd()
>>> calls bailed out in the if (!fc->initialized) branch.
>>>
>>> dmesg: https://gist.github.com/hibriansong/4eda6e7e92601df497282dcd56fd5470
>>
>> Thank you for the logs, could you try this?
>>
>> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
>> index 2aa20707f40b..cea57ad5d3ab 100644
>> --- a/fs/fuse/dev_uring.c
>> +++ b/fs/fuse/dev_uring.c
>> @@ -1324,6 +1324,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
>>          if (!fc->connected)
>>                  return -ENOTCONN;
>> +       /* Matches smp_wmb() in fuse_set_initialized() */
>> +       smp_rmb();
>> +
>>          /*
>>           * fuse_uring_register() needs the ring to be initialized,
>>           * we need to know the max payload size
>>
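>>
[For reference, the store side that this barrier pairs with is
fuse_set_initialized() in fs/fuse/dev.c; paraphrased from memory, the exact
code may differ between kernel versions:

static void fuse_set_initialized(struct fuse_conn *fc)
{
	/* Make sure stores before this are seen on another CPU */
	smp_wmb();
	fc->initialized = 1;
}

The smp_wmb() there ensures that the stores made during connection init are
visible to other CPUs before fc->initialized = 1 becomes visible.]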
>>
>>
>> Thanks,
>> Bernd
>
> I realized the issue actually comes from how QEMU handles the FUSE_INIT
> request. After processing the outargs, I didn't send the reply back to
> the kernel before starting the fuse-over-io_uring initialization. So
> it's possible that the 20 registration requests submitted via
> io_uring_cmd() reach the kernel before process_init_reply() has run and
> set fc->initialized = 1, which causes fuse_uring_cmd() to bail out
> repeatedly.
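[A minimal sketch of the corrected ordering described above; the helper
names are hypothetical, only fuse_uring_start() and the is_uring flag come
from the patch:

/*
 * Sketch, not the actual QEMU code: reply to FUSE_INIT on /dev/fuse
 * first, so the kernel's process_init_reply() can set
 * fc->initialized = 1 before any FUSE_IO_URING_CMD_REGISTER SQE
 * reaches fuse_uring_cmd().
 */
static void fuse_handle_init(FuseExport *exp, struct fuse_init_out *out)
{
    /* 1. Write the FUSE_INIT reply to the /dev/fuse fd (hypothetical helper). */
    fuse_send_init_reply(exp, out);

    /* 2. Only afterwards register the per-IOThread queues with the kernel. */
    if (exp->is_uring) {
        fuse_uring_start(exp);
    }
}
]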
>
> I also noticed that libfuse first sends the FUSE_INIT reply and only then
> allocates the queues and submits the register SQEs. But even there, during
> the fuse-over-io_uring init after sending the reply, if the kernel hasn't
> finished process_init_reply() and set fc->initialized = 1, wouldn't they
> run into a similar issue, with fuse_uring_cmd() repeatedly bailing on
> register requests because fc->initialized isn't set yet?

Hi Bernd,

Never mind, I think the write of the FUSE_INIT reply to the /dev/fuse fd
is blocking, so the kernel has already processed it (and set
fc->initialized) by the time the write returns.

Thanks so much for your feedback!

Best,
Brian
