From: Chengyu Zhu <[email protected]> Add a ublk backend powered by io_uring. This provides an alternative way to nbd for exposing an EROFS image as a block device.
Signed-off-by: Chengyu Zhu <[email protected]> Signed-off-by: Gao Xiang <[email protected]> --- Note that it still doesn't work on my test setup, so I won't apply this version for now. configure.ac | 34 ++ lib/Makefile.am | 9 +- lib/backends/ublk.c | 1427 +++++++++++++++++++++++++++++++++++++++++++ lib/liberofs_ublk.h | 53 ++ 4 files changed, 1522 insertions(+), 1 deletion(-) create mode 100644 lib/backends/ublk.c create mode 100644 lib/liberofs_ublk.h diff --git a/configure.ac b/configure.ac index 45b819053bd1..88113d8f35ff 100644 --- a/configure.ac +++ b/configure.ac @@ -185,6 +185,10 @@ AC_ARG_WITH(libnl3, [AS_HELP_STRING([--with-libnl3], [Enable and build with libnl3 support @<:@default=auto@:>@])]) +AC_ARG_ENABLE(ublk, + [AS_HELP_STRING([--enable-ublk], [enable ublk block device support @<:@default=no@:>@])], + [enable_ublk="$enableval"], [enable_ublk="no"]) + AC_ARG_ENABLE(s3, [AS_HELP_STRING([--enable-s3], [enable s3 image generation support @<:@default=no@:>@])], [enable_s3="$enableval"], [enable_s3="no"]) @@ -749,6 +753,29 @@ AS_IF([test "x$with_libnl3" != "xno"], [ ]) ]) +# Configure ublk (requires liburing) +have_ublk="no" +AS_IF([test "x$enable_ublk" = "xyes"], [ + PKG_CHECK_MODULES([liburing], [liburing >= 2.0], [ + # Paranoia: don't trust the result reported by pkgconfig before trying out + saved_LIBS="$LIBS" + saved_CPPFLAGS=${CPPFLAGS} + CPPFLAGS="${liburing_CFLAGS} ${CPPFLAGS}" + LIBS="${liburing_LIBS} $LIBS" + AC_CHECK_HEADERS([liburing.h],[ + AC_CHECK_LIB(uring, io_uring_queue_init, [], [ + AC_MSG_ERROR([liburing doesn't work properly])]) + AC_CHECK_DECL(io_uring_queue_init, [have_ublk="yes"], + [AC_MSG_ERROR([liburing doesn't work properly])], [[ +#include <liburing.h> + ]]) + ]) + LIBS="${saved_LIBS}" + CPPFLAGS="${saved_CPPFLAGS}"], [ + AC_MSG_ERROR([ublk requires liburing >= 2.0]) + ]) +]) + AS_IF([test "x$enable_s3" != "xno"], [ AS_IF( [test "x$have_libcurl" = "xyes" && \ @@ -789,6 +816,7 @@ AM_CONDITIONAL([ENABLE_S3], [test "x${have_s3}" = "xyes"]) AM_CONDITIONAL([ENABLE_STATIC_FUSE], [test "x${enable_static_fuse}" = "xyes"]) AM_CONDITIONAL([ENABLE_OCI], [test "x${have_oci}" = "xyes"]) AM_CONDITIONAL([ENABLE_FANOTIFY], [test "x${have_fanotify}" = "xyes"]) +AM_CONDITIONAL([ENABLE_UBLK], [test "x${have_ublk}" = "xyes"]) if test "x$have_uuid" = "xyes"; then AC_DEFINE([HAVE_LIBUUID], 1, [Define to 1 if libuuid is found]) @@ -870,6 +898,12 @@ if test "x$have_fanotify" = "xyes"; then [Define to 1 if fanotify backend is enabled]) fi +if test "x$have_ublk" = "xyes"; then + AC_DEFINE([HAVE_LIBURING], 1, [Define to 1 if liburing is found]) + AC_SUBST([liburing_LIBS]) + AC_SUBST([liburing_CFLAGS]) +fi + # Dump maximum block size AS_IF([test "x$erofs_cv_max_block_size" = "x"], [$erofs_cv_max_block_size = 4096], []) diff --git a/lib/Makefile.am b/lib/Makefile.am index 27bf71094bad..cecbe36309a1 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -33,7 +33,8 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \ $(top_srcdir)/lib/liberofs_gzran.h \ $(top_srcdir)/lib/liberofs_metabox.h \ $(top_srcdir)/lib/liberofs_nbd.h \ - $(top_srcdir)/lib/liberofs_s3.h + $(top_srcdir)/lib/liberofs_s3.h \ + $(top_srcdir)/lib/liberofs_ublk.h noinst_HEADERS += compressor.h if ENABLE_FANOTIFY @@ -96,6 +97,12 @@ if ENABLE_FANOTIFY liberofs_la_SOURCES += backends/fanotify.c endif endif + +if ENABLE_UBLK +liberofs_la_CFLAGS += ${liburing_CFLAGS} +liberofs_la_LDFLAGS += ${liburing_LIBS} +liberofs_la_SOURCES += backends/ublk.c +endif liberofs_la_SOURCES += remotes/oci.c remotes/docker_config.c liberofs_la_CFLAGS += ${json_c_CFLAGS} liberofs_la_LDFLAGS += ${json_c_LIBS} diff --git a/lib/backends/ublk.c b/lib/backends/ublk.c new file mode 100644 index 000000000000..86d2dc59df06 --- /dev/null +++ b/lib/backends/ublk.c @@ -0,0 +1,1427 @@ +// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 +/* + * Copyright (C) 2026 Tencent, Inc. + * http://www.tencent.com/ + */ +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/types.h> +#include <sys/eventfd.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "erofs/err.h" +#include "erofs/print.h" +#include "liberofs_ublk.h" + +#ifdef HAVE_LIBURING +#include <liburing.h> +#endif + +#ifndef PR_SET_IO_FLUSHER +#define PR_SET_IO_FLUSHER 49 +#endif + +#define UBLK_CMD_GET_QUEUE_AFFINITY 0x01 +#define UBLK_CMD_GET_DEV_INFO 0x02 +#define UBLK_CMD_ADD_DEV 0x04 +#define UBLK_CMD_DEL_DEV 0x05 +#define UBLK_CMD_START_DEV 0x06 +#define UBLK_CMD_STOP_DEV 0x07 +#define UBLK_CMD_SET_PARAMS 0x08 +#define UBLK_CMD_GET_PARAMS 0x09 +#define UBLK_CMD_START_USER_RECOVERY 0x10 +#define UBLK_CMD_END_USER_RECOVERY 0x11 + +#define UBLK_IO_FETCH_REQ 0x20 +#define UBLK_IO_COMMIT_AND_FETCH_REQ 0x21 +#define UBLK_IO_NEED_GET_DATA 0x22 + +#define UBLK_U_CMD_GET_QUEUE_AFFINITY \ + _IOR('u', UBLK_CMD_GET_QUEUE_AFFINITY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_DEV_INFO \ + _IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_ADD_DEV \ + _IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV \ + _IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_DEV \ + _IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_STOP_DEV \ + _IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_SET_PARAMS \ + _IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_PARAMS \ + _IOR('u', UBLK_CMD_GET_PARAMS, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_START_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_END_USER_RECOVERY \ + _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV_ASYNC \ + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) + +#define UBLK_U_IO_FETCH_REQ \ + _IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_COMMIT_AND_FETCH_REQ \ + _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) +#define UBLK_U_IO_NEED_GET_DATA \ + _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) + +#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1) +#define UBLK_F_USER_RECOVERY (1ULL << 3) +#define UBLK_F_UNPRIVILEGED_DEV (1ULL << 5) +#define UBLK_F_CMD_IOCTL_ENCODE (1ULL << 6) +#define UBLK_F_USER_COPY (1ULL << 7) + +#define UBLK_S_DEV_QUIESCED 2 +#define UBLK_S_DEV_FAIL_IO 3 + +#define UBLK_IO_RES_NEED_GET_DATA 1 +#define UBLK_IO_RES_ABORT (-ENODEV) + +#define UBLKSRV_CMD_BUF_OFFSET 0 +#define UBLKSRV_IO_BUF_OFFSET 0x80000000 + +#define UBLK_MAX_QUEUE_DEPTH 4096 + +#define UBLK_IO_BUF_BITS 25 +#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1) +#define UBLK_TAG_OFF UBLK_IO_BUF_BITS +#define UBLK_TAG_BITS 16 +#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS) + +#define UBLK_IO_OP_READ 0 + +struct ublksrv_ctrl_cmd { + __u32 dev_id; + __u16 queue_id; + __u16 len; + __u64 addr; + __u64 data[1]; + __u16 dev_path_len; + __u16 pad; + __u32 reserved; +}; + +struct ublksrv_ctrl_dev_info { + __u16 nr_hw_queues; + __u16 queue_depth; + __u16 state; + __u16 pad0; + __u32 max_io_buf_bytes; + __u32 dev_id; + __s32 ublksrv_pid; + __u32 pad1; + __u64 flags; + __u64 ublksrv_flags; + __u32 owner_uid; + __u32 owner_gid; + __u64 reserved1; + __u64 reserved2; +}; + +struct ublksrv_io_cmd { + __u16 q_id; + __u16 tag; + __s32 result; + __u64 addr; +}; + +struct ublksrv_io_desc { + __u32 op_flags; + __u32 nr_sectors; + __u64 start_sector; + __u64 addr; +}; + +#define UBLK_PARAM_TYPE_BASIC (1 << 0) + +struct ublk_param_basic { +#define UBLK_ATTR_READ_ONLY (1 << 0) + __u32 attrs; + __u8 logical_bs_shift; + __u8 physical_bs_shift; + __u8 io_opt_shift; + __u8 io_min_shift; + __u32 max_sectors; + __u32 chunk_sectors; + __u64 dev_sectors; + __u64 virt_boundary_mask; +}; + +struct ublk_params { + __u32 len; + __u32 types; + struct ublk_param_basic basic; +}; + + +#ifdef HAVE_LIBURING + +#define EROFS_UBLK_DEF_NR_HW_QUEUES 1 +#define EROFS_UBLK_DEF_QUEUE_DEPTH 128 +#define EROFS_UBLK_DEF_MAX_IO_BUF_BYTES (512 * 1024) +#define EROFS_UBLK_DEF_BLK_BITS 12 + +#define UBLKSRV_IO_FREE (1U << 0) +#define UBLKSRV_NEED_COMMIT_RQ_COMP (1U << 2) +#define UBLKSRV_IO_NEED_GET_DATA (1U << 3) + +#define UBLKSRV_QUEUE_STOPPING (1U << 0) +#define UBLKSRV_QUEUE_IDLE (1U << 1) + +struct erofs_ublk_io { + unsigned int flags; + int result; +}; + +struct erofs_ublk_queue { + int q_id; + int q_depth; + struct erofs_ublk_dev *dev; + struct ublksrv_io_desc *io_cmd_buf; + struct erofs_ublk_io *ios; + void *io_buf; + size_t io_buf_size; + pthread_t thread; + unsigned int state; + cpu_set_t *cpuset; + int idle_ticks; + pthread_barrier_t *init_barrier; + struct io_uring ring; +}; + +struct erofs_ublk_dev { + int ctrl_fd; + int cdev_fd; + struct ublksrv_ctrl_dev_info dev_info; + struct ublk_params params; + struct erofs_ublk_queue *queues; + erofs_ublk_io_handler_t handler; + void *handler_ctx; + int running; + int stop_requested; + int stop_efd; + int recovering; + struct io_uring ctrl_ring; + int ctrl_ring_initialized; +}; + +#define UBLK_CTRL_DEV "/dev/ublk-control" +#define UBLK_CDEV_FMT "/dev/ublkc%d" + +#define UBLK_IDLE_TIMEOUT_TICKS 200 + +#define UBLK_MAX_DEVS 128 +static struct erofs_ublk_dev *ublk_devs[UBLK_MAX_DEVS]; + +static struct erofs_ublk_dev *ublk_get_dev(int dev_id) +{ + if (dev_id < 0 || dev_id >= UBLK_MAX_DEVS) + return NULL; + return ublk_devs[dev_id]; +} + +static volatile sig_atomic_t g_sig_dev_id = -1; + +static inline __u8 ublksrv_get_op(const struct ublksrv_io_desc *iod) +{ + return iod->op_flags & 0xff; +} + +static inline unsigned int ublk_cmd_buf_sz(int q_depth) +{ + unsigned int size = q_depth * sizeof(struct ublksrv_io_desc); + unsigned int page_size = getpagesize(); + + return (size + page_size - 1) & ~(page_size - 1); +} + +static inline void *ublk_get_io_buf(struct erofs_ublk_queue *q, int tag) +{ + return (char *)q->io_buf + tag * q->dev->dev_info.max_io_buf_bytes; +} + +static inline __u64 ublk_pos(__u16 q_id, __u16 tag, __u32 offset) +{ + return UBLKSRV_IO_BUF_OFFSET + + (((__u64)q_id << UBLK_QID_OFF) | + ((__u64)tag << UBLK_TAG_OFF) | + ((__u64)(offset & UBLK_IO_BUF_BITS_MASK))); +} + +static void ublk_apply_oom_protection(void) +{ + char path[64]; + int fd; + + snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", getpid()); + fd = open(path, O_RDWR); + if (fd >= 0) { + const char *val = "-1000"; + ssize_t ret = write(fd, val, strlen(val)); + + if (ret != (ssize_t)strlen(val)) + erofs_dbg("failed to set oom_score_adj: %s", + strerror(errno)); + close(fd); + } +} + +static void ublk_set_io_flusher(void) +{ + if (prctl(PR_SET_IO_FLUSHER, 0, 0, 0, 0) != 0) + erofs_dbg("prctl(PR_SET_IO_FLUSHER) failed: %s", + strerror(errno)); +} + +static int ublk_ctrl_ring_init(struct io_uring *ring) +{ + struct io_uring_params p = { + .flags = IORING_SETUP_SQE128, + }; + return io_uring_queue_init_params(4, ring, &p); +} + +static int ublk_ctrl_cmd_ring(struct io_uring *ring, int ctrl_fd, + __u32 cmd_op, + const struct ublksrv_ctrl_cmd *cmd_data) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct ublksrv_ctrl_cmd *cmd; + int ret; + + sqe = io_uring_get_sqe(ring); + if (!sqe) + return -ENOMEM; + + io_uring_prep_rw(IORING_OP_URING_CMD, sqe, ctrl_fd, NULL, 0, 0); + sqe->cmd_op = cmd_op; + + if (cmd_data) { + cmd = (struct ublksrv_ctrl_cmd *)sqe->cmd; + memcpy(cmd, cmd_data, sizeof(*cmd_data)); + } + + ret = io_uring_submit(ring); + if (ret < 0) { + erofs_err("io_uring_submit failed: %s", strerror(-ret)); + return ret; + } + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret < 0) { + erofs_err("io_uring_wait_cqe failed: %s", strerror(-ret)); + return ret; + } + + ret = cqe->res; + io_uring_cqe_seen(ring, cqe); + return ret; +} + +static int ublk_ctrl_cmd(int ctrl_fd, __u32 cmd_op, + const struct ublksrv_ctrl_cmd *cmd_data) +{ + struct io_uring ring; + int ret; + + ret = ublk_ctrl_ring_init(&ring); + if (ret < 0) { + erofs_err("io_uring_queue_init failed: %s", strerror(-ret)); + return ret; + } + + ret = ublk_ctrl_cmd_ring(&ring, ctrl_fd, cmd_op, cmd_data); + io_uring_queue_exit(&ring); + return ret; +} + +static int ublk_dev_ctrl_cmd(struct erofs_ublk_dev *dev, __u32 cmd_op, + const struct ublksrv_ctrl_cmd *cmd_data) +{ + return ublk_ctrl_cmd_ring(&dev->ctrl_ring, dev->ctrl_fd, + cmd_op, cmd_data); +} + +static int ublk_get_queue_affinity(struct erofs_ublk_dev *dev, int q_id, + cpu_set_t *cpuset) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = q_id; + cmd.len = sizeof(cpu_set_t); + cmd.addr = (__u64)(uintptr_t)cpuset; + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_GET_QUEUE_AFFINITY, &cmd); + if (ret < 0) + erofs_dbg("GET_QUEUE_AFFINITY failed for q%d: %s", + q_id, strerror(-ret)); + return ret; +} + +static int ublk_add_dev(struct erofs_ublk_dev *dev, + const struct erofs_ublk_dev_info *info) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + struct ublksrv_ctrl_dev_info *dev_info = &dev->dev_info; + int ret; + + memset(dev_info, 0, sizeof(*dev_info)); + dev_info->nr_hw_queues = info->nr_hw_queues; + dev_info->queue_depth = info->queue_depth; + dev_info->max_io_buf_bytes = info->max_io_buf_bytes; + dev_info->dev_id = info->dev_id; + + dev_info->flags = UBLK_F_CMD_IOCTL_ENCODE | + UBLK_F_URING_CMD_COMP_IN_TASK; + + dev_info->flags |= UBLK_F_USER_COPY; + + if (info->flags & EROFS_UBLK_F_UNPRIVILEGED) + dev_info->flags |= UBLK_F_UNPRIVILEGED_DEV; + if (info->flags & EROFS_UBLK_F_USER_RECOVERY) + dev_info->flags |= UBLK_F_USER_RECOVERY; + + cmd.dev_id = dev_info->dev_id; + cmd.queue_id = (__u16)-1; + cmd.len = sizeof(*dev_info); + cmd.addr = (__u64)(uintptr_t)dev_info; + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_ADD_DEV, &cmd); + if (ret < 0) { + erofs_err("UBLK_CMD_ADD_DEV failed: %s", strerror(-ret)); + return ret; + } + + erofs_info("ublk device %d added (queues=%d, depth=%d, io_buf=%u)", + dev_info->dev_id, dev_info->nr_hw_queues, + dev_info->queue_depth, dev_info->max_io_buf_bytes); + return 0; +} + +static int ublk_del_dev(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + + return ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_DEL_DEV, &cmd); +} + +static int ublk_set_params(struct erofs_ublk_dev *dev, + const struct erofs_ublk_dev_info *info) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + struct ublk_params *params = &dev->params; + int ret; + + memset(params, 0, sizeof(*params)); + params->len = sizeof(*params); + params->types = UBLK_PARAM_TYPE_BASIC; + + params->basic.attrs = UBLK_ATTR_READ_ONLY; + params->basic.logical_bs_shift = info->blkbits; + params->basic.physical_bs_shift = info->blkbits; + params->basic.io_opt_shift = info->blkbits; + params->basic.io_min_shift = info->blkbits; + params->basic.max_sectors = info->max_io_buf_bytes >> 9; + params->basic.dev_sectors = info->dev_size >> 9; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + cmd.len = sizeof(*params); + cmd.addr = (__u64)(uintptr_t)params; + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_SET_PARAMS, &cmd); + if (ret < 0) + erofs_err("UBLK_CMD_SET_PARAMS failed: %s", strerror(-ret)); + + return ret; +} + +static int ublk_start_dev(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + cmd.data[0] = getpid(); + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_START_DEV, &cmd); + if (ret < 0) + erofs_err("UBLK_CMD_START_DEV failed: %s", strerror(-ret)); + else + erofs_info("ublk device /dev/ublkb%d started", + dev->dev_info.dev_id); + + return ret; +} + +static int ublk_stop_dev(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + + return ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_STOP_DEV, &cmd); +} + +static int ublk_start_recovery(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_START_USER_RECOVERY, &cmd); + if (ret < 0) + erofs_err("START_USER_RECOVERY failed: %s", strerror(-ret)); + else + erofs_info("ublk device %d recovery started", + dev->dev_info.dev_id); + + return ret; +} + +static int ublk_end_recovery(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + cmd.data[0] = getpid(); + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_END_USER_RECOVERY, &cmd); + if (ret < 0) + erofs_err("END_USER_RECOVERY failed: %s", strerror(-ret)); + else + erofs_info("ublk device %d recovery completed", + dev->dev_info.dev_id); + + return ret; +} + +static int ublk_get_dev_info(struct erofs_ublk_dev *dev, int dev_id) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + cmd.dev_id = dev_id; + cmd.queue_id = (__u16)-1; + cmd.len = sizeof(dev->dev_info); + cmd.addr = (__u64)(uintptr_t)&dev->dev_info; + + ret = ublk_ctrl_cmd(dev->ctrl_fd, UBLK_U_CMD_GET_DEV_INFO, &cmd); + if (ret < 0) + erofs_err("GET_DEV_INFO failed: %s", strerror(-ret)); + + return ret; +} + +static int ublk_get_params(struct erofs_ublk_dev *dev) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ret; + + dev->params.len = sizeof(dev->params); + + cmd.dev_id = dev->dev_info.dev_id; + cmd.queue_id = (__u16)-1; + cmd.len = sizeof(dev->params); + cmd.addr = (__u64)(uintptr_t)&dev->params; + + ret = ublk_dev_ctrl_cmd(dev, UBLK_U_CMD_GET_PARAMS, &cmd); + if (ret < 0) + erofs_err("GET_PARAMS failed: %s", strerror(-ret)); + + return ret; +} + +static int ublk_queue_io_cmd(struct erofs_ublk_queue *q, int tag) +{ + struct io_uring_sqe *sqe; + struct ublksrv_io_cmd *cmd; + struct erofs_ublk_io *io = &q->ios[tag]; + __u32 cmd_op; + + sqe = io_uring_get_sqe(&q->ring); + if (!sqe) + return -ENOMEM; + + if (io->flags & UBLKSRV_IO_FREE) + cmd_op = UBLK_U_IO_FETCH_REQ; + else if (io->flags & UBLKSRV_IO_NEED_GET_DATA) + cmd_op = UBLK_U_IO_NEED_GET_DATA; + else + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; + + memset(sqe, 0, sizeof(*sqe) * 2); + + sqe->opcode = IORING_OP_URING_CMD; + sqe->fd = q->dev->cdev_fd; + sqe->cmd_op = cmd_op; + sqe->user_data = tag; + + cmd = (struct ublksrv_io_cmd *)sqe->cmd; + cmd->q_id = q->q_id; + cmd->tag = tag; + + if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) + cmd->result = io->result; + else + cmd->result = 0; + + cmd->addr = 0; + + io->flags = 0; + + return 0; +} + +static int ublk_submit_fetch_commands(struct erofs_ublk_queue *q) +{ + int i, ret; + + for (i = 0; i < q->q_depth; i++) { + q->ios[i].flags = UBLKSRV_IO_FREE; + ret = ublk_queue_io_cmd(q, i); + if (ret < 0) + return ret; + } + + ret = io_uring_submit(&q->ring); + if (ret < 0) { + erofs_err("io_uring_submit (fetch) failed: %s", strerror(-ret)); + return ret; + } + + return 0; +} + +static int ublk_user_copy_read(struct erofs_ublk_queue *q, int tag, size_t len) +{ + __u64 pos = ublk_pos(q->q_id, tag, 0); + void *buf = ublk_get_io_buf(q, tag); + ssize_t ret; + + ret = pread(q->dev->cdev_fd, buf, len, pos); + if (ret < 0) { + erofs_err("user_copy pread failed: %s", strerror(errno)); + return -errno; + } + if ((size_t)ret != len) { + erofs_err("user_copy pread short: %zd/%zu", ret, len); + return -EIO; + } + return 0; +} + +static int ublk_user_copy_write(struct erofs_ublk_queue *q, int tag, size_t len) +{ + __u64 pos = ublk_pos(q->q_id, tag, 0); + void *buf = ublk_get_io_buf(q, tag); + ssize_t ret; + + ret = pwrite(q->dev->cdev_fd, buf, len, pos); + if (ret < 0) { + erofs_err("user_copy pwrite failed: %s", strerror(errno)); + return -errno; + } + if ((size_t)ret != len) { + erofs_err("user_copy pwrite short: %zd/%zu", ret, len); + return -EIO; + } + return 0; +} + +static int ublk_handle_io(struct erofs_ublk_queue *q, int tag) +{ + struct erofs_ublk_dev *dev = q->dev; + const struct ublksrv_io_desc *iod = &q->io_cmd_buf[tag]; + struct erofs_ublk_io *io = &q->ios[tag]; + struct erofs_ublk_request req; + int ret; + + req.op = ublksrv_get_op(iod); + req.start_sector = iod->start_sector; + req.nr_sectors = iod->nr_sectors; + req.buf = ublk_get_io_buf(q, tag); + req.result = 0; + + if (dev->handler) { + ret = dev->handler(dev->handler_ctx, &req); + + if (ret < 0) { + io->result = ret; + } else { + size_t io_bytes = req.nr_sectors << 9; + + if (req.op == UBLK_IO_OP_READ && io_bytes > 0) { + ret = ublk_user_copy_write(q, tag, io_bytes); + if (ret < 0) + io->result = ret; + else + io->result = io_bytes; + } else { + io->result = io_bytes; + } + } + } else { + io->result = -EOPNOTSUPP; + } + + io->flags = UBLKSRV_NEED_COMMIT_RQ_COMP; + + return 0; +} + +static int ublk_handle_cqe(struct erofs_ublk_queue *q, + struct io_uring_cqe *cqe) +{ + int tag = cqe->user_data; + struct erofs_ublk_io *io = &q->ios[tag]; + int ret = cqe->res; + + if (ret == UBLK_IO_RES_ABORT) + return -ENODEV; + + if (ret == UBLK_IO_RES_NEED_GET_DATA) { + const struct ublksrv_io_desc *iod = &q->io_cmd_buf[tag]; + size_t io_bytes = (size_t)iod->nr_sectors << 9; + + ret = ublk_user_copy_read(q, tag, io_bytes); + if (ret < 0) { + io->result = ret; + io->flags = UBLKSRV_NEED_COMMIT_RQ_COMP; + return ublk_queue_io_cmd(q, tag); + } + + io->flags = UBLKSRV_IO_NEED_GET_DATA; + return ublk_queue_io_cmd(q, tag); + } + + if (ret < 0) { + erofs_err("queue %d tag %d IO error: %s", + q->q_id, tag, strerror(-ret)); + io->flags = UBLKSRV_IO_FREE; + return ublk_queue_io_cmd(q, tag); + } + + ublk_handle_io(q, tag); + + return ublk_queue_io_cmd(q, tag); +} + +static void ublk_queue_enter_idle(struct erofs_ublk_queue *q) +{ + if (q->state & UBLKSRV_QUEUE_IDLE) + return; + + q->state |= UBLKSRV_QUEUE_IDLE; + + if (q->io_buf && q->io_buf_size > 0) + madvise(q->io_buf, q->io_buf_size, MADV_DONTNEED); + + erofs_dbg("queue %d entered idle state", q->q_id); +} + +static void ublk_queue_exit_idle(struct erofs_ublk_queue *q) +{ + if (!(q->state & UBLKSRV_QUEUE_IDLE)) + return; + + q->state &= ~UBLKSRV_QUEUE_IDLE; + q->idle_ticks = 0; + erofs_dbg("queue %d exited idle state", q->q_id); +} + +static void *ublk_queue_thread(void *arg) +{ + struct erofs_ublk_queue *q = arg; + struct io_uring_cqe *cqe; + struct __kernel_timespec ts = { + .tv_sec = 0, + .tv_nsec = 100000000, + }; + unsigned int head; + int ret, nr_events; + + if (q->cpuset) + sched_setaffinity(0, sizeof(cpu_set_t), q->cpuset); + + setpriority(PRIO_PROCESS, 0, -20); + + ublk_set_io_flusher(); + + ret = ublk_submit_fetch_commands(q); + if (ret < 0) { + erofs_err("Failed to submit fetch commands: %s", + strerror(-ret)); + if (q->init_barrier) + pthread_barrier_wait(q->init_barrier); + return NULL; + } + + if (q->init_barrier) + pthread_barrier_wait(q->init_barrier); + + erofs_info("queue %d thread started (tid=%d)", q->q_id, gettid()); + + while (!(q->state & UBLKSRV_QUEUE_STOPPING)) { + ret = io_uring_submit_and_wait_timeout(&q->ring, &cqe, 1, + &ts, NULL); + if (ret == -ETIME) { + if (++q->idle_ticks >= UBLK_IDLE_TIMEOUT_TICKS) + ublk_queue_enter_idle(q); + continue; + } + + if (ret < 0) { + if (ret == -EINTR) + continue; + erofs_err("io_uring_submit_and_wait_timeout: %s", + strerror(-ret)); + break; + } + + ublk_queue_exit_idle(q); + + nr_events = 0; + io_uring_for_each_cqe(&q->ring, head, cqe) { + ret = ublk_handle_cqe(q, cqe); + nr_events++; + if (ret == -ENODEV) { + q->state |= UBLKSRV_QUEUE_STOPPING; + break; + } + } + + if (nr_events) + io_uring_cq_advance(&q->ring, nr_events); + + io_uring_submit(&q->ring); + } + + erofs_info("queue %d thread exiting", q->q_id); + __atomic_store_n(&q->dev->stop_requested, 1, __ATOMIC_RELAXED); + if (q->dev->stop_efd >= 0) { + uint64_t val = 1; + + (void)write(q->dev->stop_efd, &val, sizeof(val)); + } + return NULL; +} + +static int ublk_init_queue(struct erofs_ublk_dev *dev, int q_id) +{ + struct erofs_ublk_queue *q = &dev->queues[q_id]; + struct io_uring_params p; + unsigned int cmd_buf_size; + int ret; + + q->q_id = q_id; + q->q_depth = dev->dev_info.queue_depth; + q->dev = dev; + q->state = 0; + q->idle_ticks = 0; + + q->cpuset = malloc(sizeof(cpu_set_t)); + if (q->cpuset) { + CPU_ZERO(q->cpuset); + ret = ublk_get_queue_affinity(dev, q_id, q->cpuset); + if (ret < 0) { + free(q->cpuset); + q->cpuset = NULL; + } + } + + cmd_buf_size = ublk_cmd_buf_sz(q->q_depth); + q->io_cmd_buf = mmap(NULL, cmd_buf_size, PROT_READ, + MAP_SHARED | MAP_POPULATE, dev->cdev_fd, + UBLKSRV_CMD_BUF_OFFSET + + q_id * (UBLK_MAX_QUEUE_DEPTH * + sizeof(struct ublksrv_io_desc))); + if (q->io_cmd_buf == MAP_FAILED) { + erofs_err("mmap io_cmd_buf failed: %s", strerror(errno)); + ret = -errno; + goto err_free_cpuset; + } + + q->ios = calloc(q->q_depth, sizeof(struct erofs_ublk_io)); + if (!q->ios) { + ret = -ENOMEM; + goto err_unmap_cmd; + } + + q->io_buf_size = (size_t)q->q_depth * dev->dev_info.max_io_buf_bytes; + q->io_buf = mmap(NULL, q->io_buf_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (q->io_buf == MAP_FAILED) { + erofs_err("mmap io_buf failed: %s", strerror(errno)); + ret = -errno; + goto err_free_ios; + } + + memset(&p, 0, sizeof(p)); + p.flags = IORING_SETUP_SQE128; + + ret = io_uring_queue_init_params(q->q_depth * 2, &q->ring, &p); + if (ret < 0) { + erofs_err("io_uring_queue_init failed: %s", strerror(-ret)); + goto err_unmap_buf; + } + + return 0; + +err_unmap_buf: + munmap(q->io_buf, q->io_buf_size); +err_free_ios: + free(q->ios); +err_unmap_cmd: + munmap(q->io_cmd_buf, cmd_buf_size); +err_free_cpuset: + free(q->cpuset); + return ret; +} + +static void ublk_cleanup_queue(struct erofs_ublk_dev *dev, int q_id) +{ + struct erofs_ublk_queue *q = &dev->queues[q_id]; + + q->state |= UBLKSRV_QUEUE_STOPPING; + + if (q->thread) { + pthread_join(q->thread, NULL); + q->thread = 0; + } + + io_uring_queue_exit(&q->ring); + + if (q->io_buf && q->io_buf != MAP_FAILED) + munmap(q->io_buf, q->io_buf_size); + + free(q->ios); + + if (q->io_cmd_buf && q->io_cmd_buf != MAP_FAILED) + munmap(q->io_cmd_buf, ublk_cmd_buf_sz(q->q_depth)); + + free(q->cpuset); +} + +static int erofs_ublk_stop(int dev_id) +{ + struct erofs_ublk_dev *dev = ublk_get_dev(dev_id); + int i; + + if (!dev) + return -EINVAL; + + __atomic_store_n(&dev->stop_requested, 1, __ATOMIC_RELAXED); + + if (dev->stop_efd >= 0) { + uint64_t val = 1; + + if (write(dev->stop_efd, &val, sizeof(val)) != sizeof(val)) + erofs_dbg("stop_efd write failed"); + } + + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) + dev->queues[i].state |= UBLKSRV_QUEUE_STOPPING; + + if (__atomic_load_n(&dev->running, __ATOMIC_RELAXED)) + ublk_stop_dev(dev); + + __atomic_store_n(&dev->running, 0, __ATOMIC_RELAXED); + return 0; +} + +static void ublk_sig_handler(int sig) +{ + if (sig == SIGTERM || sig == SIGINT) { + erofs_info("received signal %d, stopping ublk device", + sig); + if (g_sig_dev_id >= 0) + erofs_ublk_stop(g_sig_dev_id); + } +} + +static int ublk_install_sig_handler(int dev_id) +{ + struct sigaction sa; + + g_sig_dev_id = dev_id; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = ublk_sig_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + + if (sigaction(SIGTERM, &sa, NULL) < 0) { + erofs_err("sigaction(SIGTERM) failed: %s", + strerror(errno)); + return -errno; + } + + if (sigaction(SIGINT, &sa, NULL) < 0) { + erofs_err("sigaction(SIGINT) failed: %s", + strerror(errno)); + return -errno; + } + return 0; +} + +int erofs_ublk_init(void) +{ + if (access(UBLK_CTRL_DEV, F_OK) != 0) + return -EOPNOTSUPP; + return 0; +} + +int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info, + erofs_ublk_io_handler_t handler, + void *handler_ctx) +{ + struct erofs_ublk_dev *dev; + struct erofs_ublk_dev_info def_info; + char cdev_path[64]; + int i, ret, dev_id; + + if (!info) { + def_info = (struct erofs_ublk_dev_info) { + .nr_hw_queues = EROFS_UBLK_DEF_NR_HW_QUEUES, + .queue_depth = EROFS_UBLK_DEF_QUEUE_DEPTH, + .max_io_buf_bytes = EROFS_UBLK_DEF_MAX_IO_BUF_BYTES, + .blkbits = EROFS_UBLK_DEF_BLK_BITS, + .dev_id = (u32)-1, + }; + info = &def_info; + } + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return -ENOMEM; + + dev->handler = handler; + dev->handler_ctx = handler_ctx; + dev->ctrl_fd = -1; + dev->cdev_fd = -1; + dev->stop_efd = -1; + + dev->ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR); + if (dev->ctrl_fd < 0) { + ret = -errno; + erofs_err("failed to open %s: %s", + UBLK_CTRL_DEV, strerror(errno)); + goto err_free; + } + + ret = ublk_ctrl_ring_init(&dev->ctrl_ring); + if (ret < 0) { + erofs_err("ctrl ring init failed: %s", + strerror(-ret)); + goto err_close_ctrl; + } + dev->ctrl_ring_initialized = 1; + + dev->stop_efd = eventfd(0, EFD_CLOEXEC); + if (dev->stop_efd < 0) + erofs_dbg("stop eventfd creation failed: %s", + strerror(errno)); + + ret = ublk_add_dev(dev, info); + if (ret < 0) + goto err_close_ctrl; + + snprintf(cdev_path, sizeof(cdev_path), UBLK_CDEV_FMT, + dev->dev_info.dev_id); + dev->cdev_fd = open(cdev_path, O_RDWR); + if (dev->cdev_fd < 0) { + ret = -errno; + erofs_err("failed to open %s: %s", + cdev_path, strerror(errno)); + goto err_del_dev; + } + + ret = ublk_set_params(dev, info); + if (ret < 0) + goto err_close_cdev; + + dev->queues = calloc(dev->dev_info.nr_hw_queues, + sizeof(struct erofs_ublk_queue)); + if (!dev->queues) { + ret = -ENOMEM; + goto err_close_cdev; + } + + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + ret = ublk_init_queue(dev, i); + if (ret < 0) + goto err_cleanup_queues; + } + + dev_id = dev->dev_info.dev_id; + if (dev_id < 0 || dev_id >= UBLK_MAX_DEVS) { + erofs_err("kernel assigned dev_id %d out of range", + dev_id); + ret = -ERANGE; + goto err_cleanup_queues; + } + ublk_devs[dev_id] = dev; + return dev_id; + +err_cleanup_queues: + for (i = i - 1; i >= 0; i--) + ublk_cleanup_queue(dev, i); + free(dev->queues); +err_close_cdev: + close(dev->cdev_fd); +err_del_dev: + ublk_del_dev(dev); +err_close_ctrl: + close(dev->ctrl_fd); +err_free: + free(dev); + return ret; +} + +void erofs_ublk_destroy(int dev_id) +{ + struct erofs_ublk_dev *dev = ublk_get_dev(dev_id); + int i; + + if (!dev) + return; + + erofs_ublk_stop(dev_id); + + if (dev->queues) { + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) + ublk_cleanup_queue(dev, i); + free(dev->queues); + } + + if (dev->cdev_fd >= 0) + close(dev->cdev_fd); + + if (dev->ctrl_fd >= 0) { + ublk_del_dev(dev); + close(dev->ctrl_fd); + } + + if (dev->ctrl_ring_initialized) + io_uring_queue_exit(&dev->ctrl_ring); + + if (dev->stop_efd >= 0) + close(dev->stop_efd); + + ublk_devs[dev_id] = NULL; + free(dev); +} + +int erofs_ublk_start(int dev_id, int ready_fd) +{ + struct erofs_ublk_dev *dev = ublk_get_dev(dev_id); + pthread_barrier_t init_barrier; + int i, ret; + + if (!dev) + return -EINVAL; + + ublk_install_sig_handler(dev_id); + ublk_apply_oom_protection(); + + ret = pthread_barrier_init(&init_barrier, + NULL, dev->dev_info.nr_hw_queues + 1); + if (ret) { + erofs_err("pthread_barrier_init failed: %s", strerror(ret)); + return -ret; + } + + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + dev->queues[i].init_barrier = &init_barrier; + ret = pthread_create(&dev->queues[i].thread, NULL, + ublk_queue_thread, &dev->queues[i]); + if (ret) { + erofs_err("pthread_create failed: %s", strerror(ret)); + goto err_stop_threads; + } + } + + pthread_barrier_wait(&init_barrier); + pthread_barrier_destroy(&init_barrier); + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) + dev->queues[i].init_barrier = NULL; + + if (dev->recovering) + ret = ublk_end_recovery(dev); + else + ret = ublk_start_dev(dev); + if (ret < 0) + goto err_stop_threads; + + __atomic_store_n(&dev->running, 1, __ATOMIC_RELAXED); + + if (!dev->recovering && ready_fd >= 0) { + char ready = 0; + + if (write(ready_fd, &ready, 1) != 1) + erofs_dbg("ready_fd write failed"); + close(ready_fd); + } + erofs_info("ublk device %s successfully", + dev->recovering ? "recovery completed" : "started"); + + if (dev->stop_efd >= 0) { + uint64_t val; + + while (!__atomic_load_n(&dev->stop_requested, __ATOMIC_RELAXED)) { + if (read(dev->stop_efd, &val, sizeof(val)) < 0) { + if (errno == EINTR) + continue; + break; + } + break; + } + } else { + while (!__atomic_load_n(&dev->stop_requested, __ATOMIC_RELAXED)) + usleep(100000); + } + + return 0; + +err_stop_threads: + pthread_barrier_destroy(&init_barrier); + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + dev->queues[i].init_barrier = NULL; + if (dev->queues[i].thread) { + dev->queues[i].state |= UBLKSRV_QUEUE_STOPPING; + pthread_join(dev->queues[i].thread, NULL); + dev->queues[i].thread = 0; + } + } + return ret; +} + +int erofs_ublk_recover_dev(int dev_id, + erofs_ublk_io_handler_t handler, + void *handler_ctx) +{ + struct erofs_ublk_dev *dev; + char cdev_path[64]; + int i, ret; + + if (dev_id < 0) + return -EINVAL; + + dev = calloc(1, sizeof(*dev)); + if (!dev) + return -ENOMEM; + + dev->handler = handler; + dev->handler_ctx = handler_ctx; + dev->ctrl_fd = -1; + dev->cdev_fd = -1; + dev->stop_efd = -1; + + dev->ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR); + if (dev->ctrl_fd < 0) { + ret = -errno; + erofs_err("failed to open %s: %s", + UBLK_CTRL_DEV, strerror(errno)); + goto err_free; + } + + ret = ublk_ctrl_ring_init(&dev->ctrl_ring); + if (ret < 0) { + erofs_err("ctrl ring init failed: %s", + strerror(-ret)); + goto err_close_ctrl; + } + dev->ctrl_ring_initialized = 1; + + dev->stop_efd = eventfd(0, EFD_CLOEXEC); + if (dev->stop_efd < 0) + erofs_dbg("stop eventfd creation failed: %s", + strerror(errno)); + + ret = ublk_get_dev_info(dev, dev_id); + if (ret < 0) + goto err_close_ctrl; + + if (!(dev->dev_info.flags & UBLK_F_USER_RECOVERY)) { + erofs_err("Device %d does not support user recovery", dev_id); + ret = -EOPNOTSUPP; + goto err_close_ctrl; + } + + if (dev->dev_info.state != UBLK_S_DEV_QUIESCED && + dev->dev_info.state != UBLK_S_DEV_FAIL_IO) { + erofs_err("Device %d is not in recoverable state (state=%d)", + dev_id, dev->dev_info.state); + ret = -EBUSY; + goto err_close_ctrl; + } + + ret = ublk_get_params(dev); + if (ret < 0) + goto err_close_ctrl; + + ret = ublk_start_recovery(dev); + if (ret < 0) + goto err_close_ctrl; + + snprintf(cdev_path, sizeof(cdev_path), UBLK_CDEV_FMT, dev_id); + dev->cdev_fd = open(cdev_path, O_RDWR); + if (dev->cdev_fd < 0) { + ret = -errno; + erofs_err("Failed to open %s: %s", cdev_path, strerror(errno)); + goto err_close_ctrl; + } + + dev->queues = calloc(dev->dev_info.nr_hw_queues, + sizeof(struct erofs_ublk_queue)); + if (!dev->queues) { + ret = -ENOMEM; + goto err_close_cdev; + } + + for (i = 0; i < dev->dev_info.nr_hw_queues; i++) { + ret = ublk_init_queue(dev, i); + if (ret < 0) + goto err_cleanup_queues; + } + + dev->recovering = 1; + ublk_devs[dev_id] = dev; + return 0; + +err_cleanup_queues: + for (i = i - 1; i >= 0; i--) + ublk_cleanup_queue(dev, i); + free(dev->queues); +err_close_cdev: + close(dev->cdev_fd); +err_close_ctrl: + close(dev->ctrl_fd); +err_free: + free(dev); + return ret; +} + +int erofs_ublk_is_recoverable(int dev_id) +{ + struct erofs_ublk_dev dev; + int ctrl_fd, ret; + + ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR); + if (ctrl_fd < 0) + return 0; + + memset(&dev, 0, sizeof(dev)); + dev.ctrl_fd = ctrl_fd; + + ret = ublk_get_dev_info(&dev, dev_id); + close(ctrl_fd); + + if (ret < 0) + return 0; + + if ((dev.dev_info.flags & UBLK_F_USER_RECOVERY) && + dev.dev_info.state == UBLK_S_DEV_QUIESCED) + return 1; + + return 0; +} + +int erofs_ublk_del_dev_by_id(int dev_id) +{ + struct ublksrv_ctrl_cmd cmd = {0}; + int ctrl_fd, ret; + + ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR); + if (ctrl_fd < 0) + return -errno; + + cmd.dev_id = dev_id; + cmd.queue_id = (__u16)-1; + + ret = ublk_ctrl_cmd(ctrl_fd, UBLK_U_CMD_STOP_DEV, &cmd); + if (ret < 0 && ret != -ENODEV) + erofs_dbg("STOP_DEV %d: %s", dev_id, strerror(-ret)); + + ret = ublk_ctrl_cmd(ctrl_fd, UBLK_U_CMD_DEL_DEV_ASYNC, &cmd); + close(ctrl_fd); + return ret; +} + +#else /* !HAVE_LIBURING */ + +int erofs_ublk_init(void) +{ + return -EOPNOTSUPP; +} + +int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info, + erofs_ublk_io_handler_t handler, + void *handler_ctx) +{ + (void)info; + (void)handler; + (void)handler_ctx; + return -EOPNOTSUPP; +} + +void erofs_ublk_destroy(int dev_id) +{ + (void)dev_id; +} + +int erofs_ublk_start(int dev_id, int ready_fd) +{ + (void)dev_id; + (void)ready_fd; + return -EOPNOTSUPP; +} + +int erofs_ublk_recover_dev(int dev_id, + erofs_ublk_io_handler_t handler, + void *handler_ctx) +{ + (void)dev_id; + (void)handler; + (void)handler_ctx; + return -EOPNOTSUPP; +} + +int erofs_ublk_is_recoverable(int dev_id) +{ + (void)dev_id; + return 0; +} + +int erofs_ublk_del_dev_by_id(int dev_id) +{ + (void)dev_id; + return -EOPNOTSUPP; +} + +#endif /* HAVE_LIBURING */ diff --git a/lib/liberofs_ublk.h b/lib/liberofs_ublk.h new file mode 100644 index 000000000000..5aef9c3cd977 --- /dev/null +++ b/lib/liberofs_ublk.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */ +/* + * Copyright (C) 2026 Tencent, Inc. + * http://www.tencent.com/ + */ +#ifndef __EROFS_LIB_LIBEROFS_UBLK_H +#define __EROFS_LIB_LIBEROFS_UBLK_H + +#include "erofs/defs.h" + +#define EROFS_UBLK_F_UNPRIVILEGED (1U << 0) +#define EROFS_UBLK_F_USER_RECOVERY (1U << 1) + +#define EROFS_UBLK_OP_READ 0 + +struct erofs_ublk_dev_info { + u16 nr_hw_queues; + u16 queue_depth; + u32 max_io_buf_bytes; + u32 dev_id; + u64 dev_size; + u8 blkbits; + u8 reserved[3]; + u32 flags; +}; + +struct erofs_ublk_request { + u8 op; + u64 start_sector; + u32 nr_sectors; + void *buf; + int result; +}; + +typedef int (*erofs_ublk_io_handler_t)(void *ctx, + struct erofs_ublk_request *req); + +int erofs_ublk_init(void); + +int erofs_ublk_create_dev(const struct erofs_ublk_dev_info *info, + erofs_ublk_io_handler_t handler, + void *handler_ctx); +int erofs_ublk_start(int dev_id, int ready_fd); +void erofs_ublk_destroy(int dev_id); + +int erofs_ublk_del_dev_by_id(int dev_id); + +int erofs_ublk_recover_dev(int dev_id, + erofs_ublk_io_handler_t handler, + void *handler_ctx); +int erofs_ublk_is_recoverable(int dev_id); + +#endif -- 2.43.5
