Currently, only local images are supported, so the current NBD backend is mostly useful for functional testing; it is selected with the `nbd` filesystem subtype (e.g. `erofs.nbd`).
However, it already uses the vfile interface, so extending it to remote sources should be straightforward. NBD failover is not supported yet and should be implemented via the new netlink interface. Signed-off-by: Gao Xiang <hsiang...@linux.alibaba.com> --- include/erofs/defs.h | 9 ++ include/erofs/io.h | 6 ++ lib/Makefile.am | 4 + lib/backends/nbd.c | 223 +++++++++++++++++++++++++++++++++++++++++++ lib/io.c | 69 ++++++++++++- lib/liberofs_nbd.h | 39 ++++++++ mount/main.c | 156 +++++++++++++++++++++++++++++- 7 files changed, 499 insertions(+), 7 deletions(-) create mode 100644 lib/backends/nbd.c create mode 100644 lib/liberofs_nbd.h diff --git a/include/erofs/defs.h b/include/erofs/defs.h index 0f3e754..8af99ae 100644 --- a/include/erofs/defs.h +++ b/include/erofs/defs.h @@ -88,6 +88,10 @@ typedef int64_t s64; #define le32_to_cpu(x) ((__u32)(x)) #define le64_to_cpu(x) ((__u64)(x)) +#define cpu_to_be32(x) ((__be32)__builtin_bswap32(x)) +#define cpu_to_be64(x) ((__be64)__builtin_bswap64(x)) +#define be32_to_cpu(x) (__builtin_bswap32(x)) +#define be64_to_cpu(x) (__builtin_bswap64(x)) #else #if __BYTE_ORDER == __BIG_ENDIAN #define cpu_to_le16(x) (__builtin_bswap16(x)) @@ -96,6 +100,11 @@ typedef int64_t s64; #define le16_to_cpu(x) (__builtin_bswap16(x)) #define le32_to_cpu(x) (__builtin_bswap32(x)) #define le64_to_cpu(x) (__builtin_bswap64(x)) + +#define cpu_to_be32(x) ((__be32)(x)) +#define cpu_to_be64(x) ((__be64)(x)) +#define be32_to_cpu(x) ((__u32)(x)) +#define be64_to_cpu(x) ((__u64)(x)) #else #pragma error #endif diff --git a/include/erofs/io.h b/include/erofs/io.h index cc7a3cd..370765f 100644 --- a/include/erofs/io.h +++ b/include/erofs/io.h @@ -16,6 +16,7 @@ extern "C" #define _GNU_SOURCE #endif #include <unistd.h> +#include <sys/stat.h> #include <sys/uio.h> #include "defs.h" @@ -36,6 +37,8 @@ struct erofs_vfops { ssize_t (*read)(struct erofs_vfile *vf, void *buf, size_t len); off_t (*lseek)(struct erofs_vfile *vf, u64 offset, int whence); int (*fstat)(struct 
erofs_vfile *vf, struct stat *buf); + ssize_t (*sendfile)(struct erofs_vfile *vout, struct erofs_vfile *vin, + off_t *pos, size_t count); int (*xcopy)(struct erofs_vfile *vout, off_t pos, struct erofs_vfile *vin, unsigned int len, bool noseek); }; @@ -53,6 +56,7 @@ struct erofs_vfile { }; ssize_t __erofs_io_write(int fd, const void *buf, size_t len); +int __erofs_0write(int fd, size_t len); int erofs_io_fstat(struct erofs_vfile *vf, struct stat *buf); ssize_t erofs_io_pwrite(struct erofs_vfile *vf, const void *buf, u64 pos, size_t len); @@ -67,6 +71,8 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence); ssize_t erofs_copy_file_range(int fd_in, u64 *off_in, int fd_out, u64 *off_out, size_t length); +ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin, + off_t *pos, size_t count); int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos, struct erofs_vfile *vin, unsigned int len, bool noseek); diff --git a/lib/Makefile.am b/lib/Makefile.am index 955495d..4f8e767 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -31,6 +31,7 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \ $(top_srcdir)/lib/liberofs_private.h \ $(top_srcdir)/lib/liberofs_xxhash.h \ $(top_srcdir)/lib/liberofs_metabox.h \ + $(top_srcdir)/lib/liberofs_nbd.h \ $(top_srcdir)/lib/liberofs_s3.h noinst_HEADERS += compressor.h @@ -76,3 +77,6 @@ if ENABLE_EROFS_MT liberofs_la_LDFLAGS = -lpthread liberofs_la_SOURCES += workqueue.c endif +if OS_LINUX +liberofs_la_SOURCES += backends/nbd.c +endif diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c new file mode 100644 index 0000000..398a1e9 --- /dev/null +++ b/lib/backends/nbd.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 +/* + * Copyright (C) 2025 Alibaba Cloud + */ +#include <errno.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/un.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <dirent.h> 
+#include <unistd.h> +#include <stdlib.h> +#include "erofs/io.h" +#include "erofs/err.h" +#include "erofs/print.h" +#include "liberofs_nbd.h" + +#define NBD_SET_SOCK _IO( 0xab, 0 ) +#define NBD_SET_BLKSIZE _IO( 0xab, 1 ) +#define NBD_DO_IT _IO( 0xab, 3 ) +#define NBD_CLEAR_SOCK _IO( 0xab, 4 ) +#define NBD_SET_SIZE_BLOCKS _IO( 0xab, 7 ) +#define NBD_SET_TIMEOUT _IO( 0xab, 9 ) +#define NBD_SET_FLAGS _IO( 0xab, 10) + +#define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_REPLY_MAGIC 0x67446698 + +#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ + +/* + * This is the reply packet that nbd-server sends back to the client after + * it has completed an I/O request (or an error occurs). + */ +struct nbd_reply { + __be32 magic; /* NBD_REPLY_MAGIC */ + __be32 error; /* 0 = ok, else error */ + union { + __be64 cookie; /* Opaque identifier from request */ + char handle[8]; /* older spelling of cookie */ + }; +} __packed; + +long erofs_nbd_in_service(int nbdnum) +{ + int fd, err; + char s[32]; + + (void)snprintf(s, sizeof(s), "/sys/block/nbd%d/size", nbdnum); + fd = open(s, O_RDONLY); + if (fd < 0) + return -errno; + err = read(fd, s, sizeof(s)); + if (err < 0) { + err = -errno; + close(fd); + return err; + } + close(fd); + if (!memcmp(s, "0\n", sizeof("0\n") - 1)) + return -ENOTCONN; + + (void)snprintf(s, sizeof(s), "/sys/block/nbd%d/pid", nbdnum); + fd = open(s, O_RDONLY); + if (fd < 0) + return -errno; + err = read(fd, s, sizeof(s)); + if (err < 0) { + err = -errno; + close(fd); + return err; + } + close(fd); + return strtol(s, NULL, 10); +} + +int erofs_nbd_devscan(void) +{ + DIR *_dir; + int err; + + _dir = opendir("/sys/block"); + if (!_dir) { + fprintf(stderr, "failed to opendir /sys/block: %s\n", + strerror(errno)); + return -errno; + } + + while (1) { + struct dirent *dp; + char path[64]; + + /* + * set errno to 0 before calling readdir() in order to + * distinguish end of stream and from an error. 
+ */ + errno = 0; + dp = readdir(_dir); + if (!dp) { + if (errno) + err = -errno; + else + err = -EBUSY; + break; + } + + if (strncmp(dp->d_name, "nbd", 3)) + continue; + + /* Skip nbdX with valid `pid` or `backend` */ + err = snprintf(path, sizeof(path), "%s/pid", dp->d_name); + if (err < 0) + continue; + if (!faccessat(dirfd(_dir), path, F_OK, 0)) + continue; + err = snprintf(path, sizeof(path), "%s/backend", dp->d_name); + if (err < 0) + continue; + if (!faccessat(dirfd(_dir), path, F_OK, 0)) + continue; + err = atoi(dp->d_name + 3); + break; + } + closedir(_dir); + return err; +} + +int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks) +{ + int sv[2], err; + + err = socketpair(AF_UNIX, SOCK_STREAM, 0, sv); + if (err < 0) + return -errno; + + err = ioctl(nbdfd, NBD_CLEAR_SOCK, 0); + if (err < 0) + goto err_out; + + err = ioctl(nbdfd, NBD_SET_BLKSIZE, 1U << blkbits); + if (err < 0) + goto err_out; + + err = ioctl(nbdfd, NBD_SET_SIZE_BLOCKS, blocks); + if (err < 0) + goto err_out; + + err = ioctl(nbdfd, NBD_SET_TIMEOUT, 0); + if (err < 0) + goto err_out; + + err = ioctl(nbdfd, NBD_SET_FLAGS, NBD_FLAG_READ_ONLY); + if (err < 0) + goto err_out; + + err = ioctl(nbdfd, NBD_SET_SOCK, sv[1]); + if (err < 0) + goto err_out; + return sv[0]; +err_out: + close(sv[0]); + close(sv[1]); + return err; +} + +int erofs_nbd_do_it(int nbdfd) +{ + int err; + + err = ioctl(nbdfd, NBD_DO_IT, 0); + if (err < 0) { + if (errno == EPIPE) + /* + * `ioctl(NBD_DO_IT)` normally returns EPIPE when someone has + * disconnected the socket via NBD_DISCONNECT. We do not want + * to return 1 in that case. 
+ */ + err = 0; + else + err = -errno; + } + if (err) + erofs_err("NBD_DO_IT ends with %s", erofs_strerror(err)); + close(nbdfd); + return err; +} + +int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq) +{ + struct erofs_vfile vf = { .fd = skfd }; + int err; + + err = erofs_io_read(&vf, rq, sizeof(*rq)); + if (err < sizeof(*rq)) + return -EPIPE; + + if (rq->magic != cpu_to_be32(NBD_REQUEST_MAGIC)) + return -EIO; + + rq->type = be32_to_cpu((__be32)rq->type); + rq->from = be64_to_cpu((__be64)rq->from); + rq->len = be32_to_cpu((__be32)rq->len); + return 0; +} + +int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err) +{ + struct nbd_reply reply = { + .magic = cpu_to_be32(NBD_REPLY_MAGIC), + .error = cpu_to_be32(err), + .cookie = cookie, + }; + int ret; + + ret = write(skfd, &reply, sizeof(reply)); + if (ret == sizeof(reply)) + return 0; + return ret < 0 ? -errno : -EIO; +} diff --git a/lib/io.c b/lib/io.c index b91c93c..ff3b794 100644 --- a/lib/io.c +++ b/lib/io.c @@ -147,10 +147,29 @@ int erofs_io_fsync(struct erofs_vfile *vf) return 0; } +static const char erofs_zeroed[EROFS_MAX_BLOCK_SIZE]; + +int __erofs_0write(int fd, size_t len) +{ + int err = 0; + + while (len) { + u32 count = min_t(u64, sizeof(erofs_zeroed), len); + + err = write(fd, erofs_zeroed, count); + if (err <= 0) { + if (err < 0) + err = -errno; + break; + } + len -= err; + } + return err < 0 ? 
err : len; +} + int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset, size_t len, bool zeroout) { - static const char zero[EROFS_MAX_BLOCK_SIZE] = {0}; ssize_t ret; if (__erofs_unlikely(cfg.c_dry_run)) @@ -164,14 +183,15 @@ int erofs_io_fallocate(struct erofs_vfile *vf, u64 offset, FALLOC_FL_KEEP_SIZE, offset + vf->offset, len) >= 0) return 0; #endif - while (len > EROFS_MAX_BLOCK_SIZE) { - ret = erofs_io_pwrite(vf, zero, offset, EROFS_MAX_BLOCK_SIZE); + while (len > sizeof(erofs_zeroed)) { + ret = erofs_io_pwrite(vf, erofs_zeroed, offset, + sizeof(erofs_zeroed)); if (ret < 0) return (int)ret; len -= ret; offset += ret; } - return erofs_io_pwrite(vf, zero, offset, len) == len ? 0 : -EIO; + return erofs_io_pwrite(vf, erofs_zeroed, offset, len) == len ? 0 : -EIO; } int erofs_io_ftruncate(struct erofs_vfile *vf, u64 length) @@ -551,6 +571,47 @@ off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence) return lseek(vf->fd, offset, whence); } +ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin, + off_t *pos, size_t count) +{ + ssize_t written; + + if (vin->ops || vout->ops) { + if (vin->ops) + return vin->ops->sendfile(vout, vin, pos, count); + return vout->ops->sendfile(vout, vin, pos, count); + } +#if defined(HAVE_SYS_SENDFILE_H) && defined(HAVE_SENDFILE) + do { + written = sendfile(vout->fd, vin->fd, pos, count); + if (written <= 0) { + if (written < 0) { + written = -errno; + if (written == -EOVERFLOW && pos) + written = 0; + } + break; + } + count -= written; + } while (written); +#endif + while (count) { + char buf[EROFS_MAX_BLOCK_SIZE]; + + written = min_t(u64, count, sizeof(buf)); + if (pos) + written = erofs_io_pread(vin, buf, written, *pos); + else + written = erofs_io_read(vin, buf, written); + if (written <= 0) + break; + count -= written; + if (pos) + *pos += written; + } + return written < 0 ? 
written : count; +} + int erofs_io_xcopy(struct erofs_vfile *vout, off_t pos, struct erofs_vfile *vin, unsigned int len, bool noseek) { diff --git a/lib/liberofs_nbd.h b/lib/liberofs_nbd.h new file mode 100644 index 0000000..6660df1 --- /dev/null +++ b/lib/liberofs_nbd.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */ +/* + * Copyright (C) 2025 Alibaba Cloud + */ +#ifndef __EROFS_LIB_LIBEROFS_NBD_H +#define __EROFS_LIB_LIBEROFS_NBD_H + +#include "erofs/defs.h" + +/* Supported request types */ +enum { + EROFS_NBD_CMD_READ = 0, + EROFS_NBD_CMD_WRITE = 1, + EROFS_NBD_CMD_DISC = 2, + EROFS_NBD_CMD_FLUSH = 3, + EROFS_NBD_CMD_TRIM = 4, + /* userspace defines additional extension commands */ + EROFS_NBD_CMD_WRITE_ZEROES = 6, +}; + +struct erofs_nbd_request { + __be32 magic; /* NBD_REQUEST_MAGIC */ + u32 type; /* See NBD_CMD_* */ + union { + __be64 cookie; /* Opaque identifier for request */ + char handle[8]; /* older spelling of cookie */ + }; + u64 from; + u32 len; +} __packed; + +long erofs_nbd_in_service(int nbdnum); +int erofs_nbd_devscan(void); +int erofs_nbd_connect(int nbdfd, int blkbits, u64 blocks); +int erofs_nbd_do_it(int nbdfd); +int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq); +int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err); + +#endif diff --git a/mount/main.c b/mount/main.c index 0f7538a..9cb203f 100644 --- a/mount/main.c +++ b/mount/main.c @@ -6,10 +6,13 @@ #include <stdlib.h> #include <string.h> #include <sys/mount.h> +#include <pthread.h> #include <unistd.h> #include "erofs/config.h" #include "erofs/print.h" #include "erofs/err.h" +#include "erofs/io.h" +#include "../lib/liberofs_nbd.h" #ifdef HAVE_LINUX_LOOP_H #include <linux/loop.h> #else @@ -30,6 +33,7 @@ enum erofs_backend_drv { EROFSAUTO, EROFSLOCAL, EROFSFUSE, + EROFSNBD, }; static struct erofsmount_cfg { @@ -132,6 +136,8 @@ static int erofsmount_parse_options(int argc, char **argv) mountcfg.backend = EROFSFUSE; } else if (!strcmp(dot + 1, 
"local")) { mountcfg.backend = EROFSLOCAL; + } else if (!strcmp(dot + 1, "nbd")) { + mountcfg.backend = EROFSNBD; } else { erofs_err("invalid filesystem subtype `%s`", dot + 1); return -EINVAL; @@ -196,11 +202,148 @@ static int erofsmount_fuse(const char *source, const char *mountpoint, return 0; } +struct erofsmount_nbd_ctx { + struct erofs_vfile vd; /* virtual device */ + struct erofs_vfile sk; /* socket file */ +}; + +static void *erofsmount_nbd_loopfn(void *arg) +{ + struct erofsmount_nbd_ctx *ctx = arg; + int err; + + while (1) { + struct erofs_nbd_request rq; + ssize_t rem; + off_t pos; + + err = erofs_nbd_get_request(ctx->sk.fd, &rq); + if (err < 0) { + if (err == -EPIPE) + err = 0; + break; + } + + if (rq.type != EROFS_NBD_CMD_READ) { + err = erofs_nbd_send_reply_header(ctx->sk.fd, + rq.cookie, -EIO); + if (err) + break; + } + + erofs_nbd_send_reply_header(ctx->sk.fd, rq.cookie, 0); + pos = rq.from; + rem = erofs_io_sendfile(&ctx->sk, &ctx->vd, &pos, rq.len); + if (rem < 0) { + err = -errno; + break; + } + err = __erofs_0write(ctx->sk.fd, rem); + if (err) { + if (err > 0) + err = -EIO; + break; + } + } + close(ctx->vd.fd); + close(ctx->sk.fd); + return (void *)(uintptr_t)err; +} + +static int erofsmount_startnbd(int nbdfd, const char *source) +{ + struct erofsmount_nbd_ctx ctx = {}; + uintptr_t retcode; + pthread_t th; + int err, err2; + + err = open(source, O_RDONLY); + if (err < 0) { + err = -errno; + goto out_closefd; + } + ctx.vd.fd = err; + + err = erofs_nbd_connect(nbdfd, 9, INT64_MAX >> 9); + if (err < 0) { + close(ctx.vd.fd); + goto out_closefd; + } + ctx.sk.fd = err; + + err = -pthread_create(&th, NULL, erofsmount_nbd_loopfn, &ctx); + if (err) { + close(ctx.vd.fd); + close(ctx.sk.fd); + goto out_closefd; + } + + err = erofs_nbd_do_it(nbdfd); + err2 = -pthread_join(th, (void **)&retcode); + if (!err2 && retcode) { + erofs_err("NBD worker failed with %s", + erofs_strerror(retcode)); + err2 = retcode; + } + return err ?: err2; +out_closefd: + 
close(nbdfd); + return err; +} + +static int erofsmount_nbd(const char *source, const char *mountpoint, + const char *fstype, int flags, + const char *options) +{ + char nbdpath[32]; + int num, nbdfd; + pid_t pid; + long err; + + if (strcmp(fstype, "erofs")) { + fprintf(stderr, "unsupported filesystem type `%s`\n", + mountcfg.fstype); + return -ENODEV; + } + flags |= MS_RDONLY; + + num = erofs_nbd_devscan(); + if (num < 0) + return num; + + (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); + nbdfd = open(nbdpath, O_RDWR); + if (nbdfd < 0) + return -errno; + + if ((pid = fork()) == 0) + return erofsmount_startnbd(nbdfd, source) ? + EXIT_FAILURE : EXIT_SUCCESS; + close(nbdfd); + + while (1) { + err = erofs_nbd_in_service(num); + if (err == -ENOENT || err == -ENOTCONN) { + usleep(50000); + continue; + } + if (err >= 0) + err = (err != pid ? -EBUSY : 0); + break; + } + if (!err) { + err = mount(nbdpath, mountpoint, fstype, flags, options); + if (err < 0) + err = -errno; + } + return err; +} + #define EROFSMOUNT_LOOPDEV_RETRIES 3 -int erofsmount_loopmount(const char *source, const char *mountpoint, - const char *fstype, int flags, - const char *options) +static int erofsmount_loopmount(const char *source, const char *mountpoint, + const char *fstype, int flags, + const char *options) { int fd, dfd, num; struct loop_info li = {}; @@ -269,6 +412,13 @@ int main(int argc, char *argv[]) goto exit; } + if (mountcfg.backend == EROFSNBD) { + err = erofsmount_nbd(mountcfg.device, mountcfg.mountpoint, + mountcfg.fstype, mountcfg.flags, + mountcfg.options); + goto exit; + } + err = mount(mountcfg.device, mountcfg.mountpoint, mountcfg.fstype, mountcfg.flags, mountcfg.options); if (err < 0) -- 2.43.0