It allows directory data and inode metadata to be kept as close
together as possible, significantly improving metadata performance
for long-latency remote image use cases.

Usage:
 $ mkfs.erofs --MZ foo.erofs foo/

Signed-off-by: Gao Xiang <[email protected]>
---
 include/erofs/importer.h |  1 +
 include/erofs/inode.h    |  3 +-
 include/erofs/internal.h |  8 ++++
 lib/cache.c              |  4 ++
 lib/inode.c              | 98 ++++++++++++++++++++++++++++------------
 lib/io.c                 |  5 +-
 lib/metabox.c            | 22 +++++----
 lib/remotes/s3.c         |  3 +-
 lib/super.c              |  4 +-
 lib/tar.c                |  2 +-
 mkfs/main.c              | 27 +++++++++--
 11 files changed, 129 insertions(+), 48 deletions(-)

diff --git a/include/erofs/importer.h b/include/erofs/importer.h
index a525b474f1d5..60160d6bea05 100644
--- a/include/erofs/importer.h
+++ b/include/erofs/importer.h
@@ -46,6 +46,7 @@ struct erofs_importer_params {
        bool no_datainline;
        /* Issue directory data (except inline data) separately from regular 
inodes */
        bool grouped_dirdata;
+       bool dirdata_in_metazone;
        bool hard_dereference;
        bool ovlfs_strip;
        bool dot_omitted;
diff --git a/include/erofs/inode.h b/include/erofs/inode.h
index 89bd16aecc06..ba62ece9a7cc 100644
--- a/include/erofs/inode.h
+++ b/include/erofs/inode.h
@@ -38,7 +38,8 @@ erofs_nid_t erofs_lookupnid(struct erofs_inode *inode);
 int erofs_iflush(struct erofs_inode *inode);
 struct erofs_dentry *erofs_d_alloc(struct erofs_inode *parent,
                                   const char *name);
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t 
nblocks);
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t 
nblocks,
+                                bool in_metazone);
 bool erofs_dentry_is_wht(struct erofs_sb_info *sbi, struct erofs_dentry *d);
 int __erofs_fill_inode(struct erofs_importer *im, struct erofs_inode *inode,
                       struct stat *st, const char *path);
diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 5798f10e89c2..2fe4514b3d23 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -157,6 +157,7 @@ struct erofs_sb_info {
        struct erofs_buffer_head *bh_devt;
        bool useqpl;
        bool sb_valid;
+       u32 metazone_startblk;
 };
 
 /* make sure that any user of the erofs headers has atleast 64bit off_t type */
@@ -205,6 +206,8 @@ struct erofs_diskbuf;
 #define EROFS_INODE_DATA_SOURCE_DISKBUF                2
 #define EROFS_INODE_DATA_SOURCE_RESVSP         3
 
+#define EROFS_I_BLKADDR_DEV_ID_BIT             48
+
 struct erofs_inode {
        struct list_head i_hash, i_subdirs, i_xattrs;
 
@@ -308,6 +311,11 @@ static inline bool erofs_inode_in_metabox(struct 
erofs_inode *inode)
        return inode->nid >> EROFS_DIRENT_NID_METABOX_BIT;
 }
 
+static inline erofs_blk_t erofs_inode_dev_baddr(struct erofs_inode *inode)
+{
+       return inode->u.i_blkaddr & (BIT(EROFS_I_BLKADDR_DEV_ID_BIT) - 1);
+}
+
 static inline erofs_off_t erofs_iloc(struct erofs_inode *inode)
 {
        struct erofs_sb_info *sbi = inode->sbi;
diff --git a/lib/cache.c b/lib/cache.c
index a87575ad74d1..f23dbb06264a 100644
--- a/lib/cache.c
+++ b/lib/cache.c
@@ -479,6 +479,10 @@ static int __erofs_bflush(struct erofs_bufmgr *bmgr,
 
                        /* flush and remove bh */
                        ret = bh->op->flush(bh);
+                       if (__erofs_unlikely(ret == -EBUSY && !forget)) {
+                               skip = true;
+                               continue;
+                       }
                        if (ret < 0)
                                return ret;
                }
diff --git a/lib/inode.c b/lib/inode.c
index e44e03cf460f..88dc41b19e5b 100644
--- a/lib/inode.c
+++ b/lib/inode.c
@@ -194,9 +194,12 @@ struct erofs_dentry *erofs_d_alloc(struct erofs_inode 
*parent,
 }
 
 /* allocate main data for an inode */
-int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t 
nblocks)
+int erofs_allocate_inode_bh_data(struct erofs_inode *inode, erofs_blk_t 
nblocks,
+                                bool in_metazone)
 {
-       struct erofs_bufmgr *bmgr = inode->sbi->bmgr;
+       struct erofs_sb_info *sbi = inode->sbi;
+       struct erofs_bufmgr *bmgr = in_metazone ?
+               erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
        struct erofs_buffer_head *bh;
        int ret, type;
 
@@ -206,9 +209,15 @@ int erofs_allocate_inode_bh_data(struct erofs_inode 
*inode, erofs_blk_t nblocks)
                return 0;
        }
 
+       if (in_metazone && !bmgr) {
+               erofs_err("cannot allocate data in the metazone when 
unavailable for %s",
+                         inode->i_srcpath);
+               return -EINVAL;
+       }
+
        /* allocate main data buffer */
        type = S_ISDIR(inode->i_mode) ? DIRA : DATA;
-       bh = erofs_balloc(bmgr, type, erofs_pos(inode->sbi, nblocks), 0);
+       bh = erofs_balloc(bmgr, type, erofs_pos(sbi, nblocks), 0);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
 
@@ -220,7 +229,8 @@ int erofs_allocate_inode_bh_data(struct erofs_inode *inode, 
erofs_blk_t nblocks)
        DBG_BUGON(ret < 0);
 
        /* write blocks except for the tail-end block */
-       inode->u.i_blkaddr = bh->block->blkaddr;
+       inode->u.i_blkaddr = bh->block->blkaddr | (in_metazone ?
+               (sbi->extra_devices + 1ULL) << EROFS_I_BLKADDR_DEV_ID_BIT : 0);
        return 0;
 }
 
@@ -591,7 +601,7 @@ int erofs_write_file_from_buffer(struct erofs_inode *inode, 
char *buf)
 
        inode->datalayout = EROFS_INODE_FLAT_INLINE;
 
-       ret = erofs_allocate_inode_bh_data(inode, nblocks);
+       ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
        if (ret)
                return ret;
 
@@ -622,16 +632,17 @@ static bool erofs_file_is_compressible(struct 
erofs_importer *im,
 
 static int erofs_write_unencoded_data(struct erofs_inode *inode,
                                      struct erofs_vfile *vf, erofs_off_t fpos,
-                                     bool noseek)
+                                     bool noseek, bool in_metazone)
 {
        struct erofs_sb_info *sbi = inode->sbi;
-       erofs_blk_t nblocks, i;
+       struct erofs_bufmgr *bmgr;
+       erofs_off_t remaining, pos;
        unsigned int len;
        int ret;
 
        if (!noseek && erofs_sb_has_48bit(sbi)) {
-               if (erofs_io_lseek(vf, fpos, SEEK_DATA) < 0 && errno == ENXIO) {
-                       ret = erofs_allocate_inode_bh_data(inode, 0);
+               if (erofs_io_lseek(vf, fpos, SEEK_DATA) == -ENXIO) {
+                       ret = erofs_allocate_inode_bh_data(inode, 0, false);
                        if (ret)
                                return ret;
                        inode->datalayout = EROFS_INODE_FLAT_PLAIN;
@@ -640,27 +651,31 @@ static int erofs_write_unencoded_data(struct erofs_inode 
*inode,
                ret = erofs_io_lseek(vf, fpos, SEEK_SET);
                if (ret < 0)
                        return ret;
-               else if (ret != fpos)
+               if (ret != fpos)
                        return -EIO;
        }
 
-       nblocks = inode->i_size >> sbi->blkszbits;
-       ret = erofs_allocate_inode_bh_data(inode, nblocks);
+       inode->idata_size = inode->i_size % erofs_blksiz(sbi);
+       remaining = inode->i_size - inode->idata_size;
+
+       ret = erofs_allocate_inode_bh_data(inode, remaining >> sbi->blkszbits,
+                                          in_metazone);
        if (ret)
                return ret;
 
-       for (i = 0; i < nblocks; i += (len >> sbi->blkszbits)) {
+       bmgr = in_metazone ? erofs_metadata_bmgr(sbi, false) : sbi->bmgr;
+       pos = erofs_pos(sbi, erofs_inode_dev_baddr(inode));
+       while (remaining) {
                len = min_t(u64, round_down(UINT_MAX, 1U << sbi->blkszbits),
-                           erofs_pos(sbi, nblocks - i));
-               ret = erofs_io_xcopy(&sbi->bdev,
-                                    erofs_pos(sbi, inode->u.i_blkaddr + i),
-                                    vf, len, noseek);
+                           remaining);
+               ret = erofs_io_xcopy(bmgr->vf, pos, vf, len, noseek);
                if (ret)
                        return ret;
+               pos += len;
+               remaining -= len;
        }
 
        /* read the tail-end data */
-       inode->idata_size = inode->i_size % erofs_blksiz(sbi);
        if (inode->idata_size) {
                inode->idata = malloc(inode->idata_size);
                if (!inode->idata)
@@ -691,10 +706,11 @@ int erofs_write_unencoded_file(struct erofs_inode *inode, 
int fd, u64 fpos)
        /* fallback to all data uncompressed */
        return erofs_write_unencoded_data(inode,
                        &(struct erofs_vfile){ .fd = fd }, fpos,
-                       inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+                       inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF, 
false);
 }
 
-static int erofs_write_dir_file(struct erofs_inode *dir)
+static int erofs_write_dir_file(const struct erofs_importer *im,
+                               struct erofs_inode *dir)
 {
        unsigned int bsz = erofs_blksiz(dir->sbi);
        struct erofs_vfile *vf;
@@ -708,7 +724,8 @@ static int erofs_write_dir_file(struct erofs_inode *dir)
                err = erofs_write_compress_dir(dir, vf);
        } else {
                DBG_BUGON(dir->idata_size != (dir->i_size & (bsz - 1)));
-               err = erofs_write_unencoded_data(dir, vf, 0, true);
+               err = erofs_write_unencoded_data(dir, vf, 0, true,
+                                       im->params->dirdata_in_metazone);
        }
        erofs_io_close(vf);
        return err;
@@ -732,19 +749,39 @@ int erofs_iflush(struct erofs_inode *inode)
        struct iovec iov[2];
        char *xattrs = NULL;
        bool nlink_1 = true;
-       int ret, fmt;
+       int ret, fmt, dev_id;
 
        DBG_BUGON(bh && erofs_btell(bh, false) != off);
-
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
-           S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
+           S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                u1.rdev = cpu_to_le32(inode->u.i_rdev);
-       else if (is_inode_layout_compression(inode))
+       } else if (is_inode_layout_compression(inode)) {
                u1.blocks_lo = cpu_to_le32(inode->u.i_blocks);
-       else if (inode->datalayout == EROFS_INODE_CHUNK_BASED)
+       } else if (inode->datalayout == EROFS_INODE_CHUNK_BASED) {
                u1.c.format = cpu_to_le16(inode->u.chunkformat);
-       else
+       } else {
+               if (inode->u.i_blkaddr != EROFS_NULL_ADDR) {
+                       dev_id = inode->u.i_blkaddr >> 
EROFS_I_BLKADDR_DEV_ID_BIT;
+
+                       if (dev_id) {
+                               if (dev_id <= sbi->extra_devices) {
+                                       if (!sbi->devs[dev_id - 1].uniaddr) {
+                                               DBG_BUGON(1);   /* impossible 
now */
+                                               return -EBUSY;
+                                       }
+                                       inode->u.i_blkaddr += sbi->devs[dev_id 
- 1].uniaddr;
+                               } else {
+                                       if (sbi->metazone_startblk == 
EROFS_META_NEW_ADDR) {
+                                               DBG_BUGON(1);   /* impossible 
now */
+                                               return -EBUSY;
+                                       }
+                                       DBG_BUGON(dev_id != sbi->extra_devices 
+ 1);
+                                       inode->u.i_blkaddr += 
sbi->metazone_startblk;
+                               }
+                       }
+               }
                u1.startblk_lo = cpu_to_le32(inode->u.i_blkaddr);
+       }
 
        if (is_inode_layout_compression(inode) &&
            inode->u.i_blocks > UINT32_MAX) {
@@ -894,7 +931,7 @@ static bool erofs_inode_need_48bit(struct erofs_inode 
*inode)
                        return true;
        } else if (!is_inode_layout_compression(inode)) {
                if (inode->u.i_blkaddr != EROFS_NULL_ADDR &&
-                   inode->u.i_blkaddr > UINT32_MAX)
+                   erofs_inode_dev_baddr(inode) > UINT32_MAX)
                        return true;
        }
        return false;
@@ -1564,7 +1601,7 @@ static int erofs_mkfs_jobfn(const struct erofs_mkfs_btctx 
*ctx,
                return erofs_mkfs_create_directory(ctx, inode);
 
        if (item->type == EROFS_MKFS_JOB_DIR_BH) {
-               ret = erofs_write_dir_file(inode);
+               ret = erofs_write_dir_file(ctx->im, inode);
                if (ret)
                        return ret;
                erofs_write_tail_end(inode);
@@ -2313,7 +2350,8 @@ struct erofs_inode 
*erofs_mkfs_build_special_from_fd(struct erofs_importer *im,
        inode->datalayout = EROFS_INODE_FLAT_INLINE;
        ret = erofs_write_unencoded_data(inode,
                        &(struct erofs_vfile){ .fd = fd }, 0,
-                       inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF);
+                       inode->datasource == EROFS_INODE_DATA_SOURCE_DISKBUF,
+                       false);
        if (ret)
                return ERR_PTR(ret);
 out:
diff --git a/lib/io.c b/lib/io.c
index 37a74f63c45e..0c5eb2c29989 100644
--- a/lib/io.c
+++ b/lib/io.c
@@ -571,10 +571,13 @@ ssize_t erofs_io_write(struct erofs_vfile *vf, void *buf, 
size_t len)
 
 off_t erofs_io_lseek(struct erofs_vfile *vf, u64 offset, int whence)
 {
+       off_t ret;
+
        if (vf->ops)
                return vf->ops->lseek(vf, offset, whence);
 
-       return lseek(vf->fd, offset, whence);
+       ret = lseek(vf->fd, offset, whence);
+       return ret < 0 ? -errno : ret;
 }
 
 ssize_t erofs_io_sendfile(struct erofs_vfile *vout, struct erofs_vfile *vin,
diff --git a/lib/metabox.c b/lib/metabox.c
index 37267ddb73cf..d6abd5123cc8 100644
--- a/lib/metabox.c
+++ b/lib/metabox.c
@@ -54,7 +54,7 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
        struct erofs_metamgr *m2gr;
        int ret;
 
-       if (!sbi->m2gr && sbi->meta_blkaddr == EROFS_META_NEW_ADDR) {
+       if (!sbi->m2gr && sbi->metazone_startblk == EROFS_META_NEW_ADDR) {
                m2gr = malloc(sizeof(*m2gr));
                if (!m2gr)
                        return -ENOMEM;
@@ -62,6 +62,8 @@ int erofs_metadata_init(struct erofs_sb_info *sbi)
                if (ret)
                        goto err_free;
                sbi->m2gr = m2gr;
+               /* FIXME: sbi->meta_blkaddr should be 0 for 48-bit layouts */
+               sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
        }
 
        if (!sbi->mxgr && erofs_sb_has_metabox(sbi)) {
@@ -124,20 +126,24 @@ int erofs_metazone_flush(struct erofs_sb_info *sbi)
 
        if (!m2gr)
                return 0;
-       m2bgr = m2gr->bmgr;
+       bh = erofs_balloc(sbi->bmgr, DATA, 0, 0);
+       if (!bh)
+               return PTR_ERR(bh);
+       erofs_mapbh(NULL, bh->block);
+       pos_out = erofs_btell(bh, false);
+       meta_blkaddr = pos_out >> sbi->blkszbits;
+       sbi->metazone_startblk = meta_blkaddr;
 
+       m2bgr = m2gr->bmgr;
        ret = erofs_bflush(m2bgr, NULL);
        if (ret)
                return ret;
 
        length = erofs_mapbh(m2bgr, NULL) << sbi->blkszbits;
-       bh = erofs_balloc(sbi->bmgr, DATA, length, 0);
-       if (!bh)
-               return PTR_ERR(bh);
+       ret = erofs_bh_balloon(bh, length);
+       if (ret < 0)
+               return ret;
 
-       erofs_mapbh(NULL, bh->block);
-       pos_out = erofs_btell(bh, false);
-       meta_blkaddr = pos_out >> sbi->blkszbits;
        do {
                count = min_t(erofs_off_t, length, INT_MAX);
                ret = erofs_io_xcopy(sbi->bmgr->vf, pos_out,
diff --git a/lib/remotes/s3.c b/lib/remotes/s3.c
index 223c3e89d6fd..b0ca84b51afc 100644
--- a/lib/remotes/s3.c
+++ b/lib/remotes/s3.c
@@ -1032,7 +1032,8 @@ static int s3erofs_remote_getobject(struct erofs_importer 
*im,
                inode->datalayout = EROFS_INODE_FLAT_PLAIN;
                inode->idata_size = 0;
                ret = erofs_allocate_inode_bh_data(inode,
-                               DIV_ROUND_UP(inode->i_size, 1U << 
sbi->blkszbits));
+                               DIV_ROUND_UP(inode->i_size, 1U << 
sbi->blkszbits),
+                               false);
                if (ret)
                        return ret;
                resp.vf = &sbi->bdev;
diff --git a/lib/super.c b/lib/super.c
index a4837e5702ed..0180087e184e 100644
--- a/lib/super.c
+++ b/lib/super.c
@@ -445,9 +445,9 @@ int erofs_mkfs_format_fs(struct erofs_sb_info *sbi, 
unsigned int blkszbits,
        sbi->bmgr = bmgr;
        bmgr->dsunit = dsunit;
        if (metazone)
-               sbi->meta_blkaddr = EROFS_META_NEW_ADDR;
+               sbi->metazone_startblk = EROFS_META_NEW_ADDR;
        else
-               sbi->meta_blkaddr = 0;
+               sbi->metazone_startblk = 0;
        bh = erofs_reserve_sb(bmgr);
        if (IS_ERR(bh))
                return PTR_ERR(bh);
diff --git a/lib/tar.c b/lib/tar.c
index d5095169f9ba..1f3092566bd9 100644
--- a/lib/tar.c
+++ b/lib/tar.c
@@ -632,7 +632,7 @@ static int tarerofs_write_uncompressed_file(struct 
erofs_inode *inode,
        inode->datalayout = EROFS_INODE_FLAT_PLAIN;
        nblocks = DIV_ROUND_UP(inode->i_size, 1U << sbi->blkszbits);
 
-       ret = erofs_allocate_inode_bh_data(inode, nblocks);
+       ret = erofs_allocate_inode_bh_data(inode, nblocks, false);
        if (ret)
                return ret;
 
diff --git a/mkfs/main.c b/mkfs/main.c
index 620b1ed2b0c3..ffcb8cf75225 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -100,7 +100,7 @@ static struct option long_options[] = {
        {"oci", optional_argument, NULL, 534},
 #endif
        {"zD", optional_argument, NULL, 536},
-       {"ZI", optional_argument, NULL, 537},
+       {"MZ", optional_argument, NULL, 537},
        {"xattr-prefix", required_argument, NULL, 538},
        {"xattr-inode-digest", required_argument, NULL, 539},
        {0, 0, 0, 0},
@@ -178,7 +178,8 @@ static void usage(int argc, char **argv)
                "    --mkfs-time         the timestamp is applied as build time 
only\n"
                " -UX                    use a given filesystem UUID\n"
                " --zD[=<0|1>]           specify directory compression: 
0=disable [default], 1=enable\n"
-               " --ZI[=<0|1>]           specify the separate inode metadata 
zone availability: 0=disable [default], 1=enable\n"
+               " --MZ[=<0|[id]>]        put inode metadata ('i') and/or 
directory data ('d') into the separate metadata zone.\n"
+               "                        No argument enables both.  0=disable 
[default].\n"
                " --all-root             make all files owned by root\n"
 #ifdef EROFS_MT_ENABLED
                " --async-queue-limit=#  specify the maximum number of entries 
in the multi-threaded job queue\n"
@@ -1411,10 +1412,28 @@ static int mkfs_parse_options_cfg(struct 
erofs_importer_params *params,
                        }
                        break;
                case 537:
-                       if (!optarg || strcmp(optarg, "1"))
+                       if (!optarg) {
                                mkfscfg.inode_metazone = true;
-                       else
+                               params->dirdata_in_metazone = true;
+                       } else if (!strcmp(optarg, "0")) {
                                mkfscfg.inode_metazone = false;
+                               params->dirdata_in_metazone = false;
+                       } else {
+                               for (i = 0; optarg[i]; ++i) {
+                                       if (optarg[i] == 'i') {
+                                               mkfscfg.inode_metazone = true;
+                                       } else if (optarg[i] == 'd') {
+                                               params->dirdata_in_metazone = 
true;
+                                       } else {
+                                               erofs_err("invalid metazone 
flags `%s`", optarg);
+                                               return -EINVAL;
+                                       }
+                               }
+                               if (params->dirdata_in_metazone && 
!mkfscfg.inode_metazone) {
+                                       erofs_err("inode metadata must be in 
the metadata zone if directory data is stored there");
+                                       return -EINVAL;
+                               }
+                       }
                        break;
                case 538:
                        errno = 0;
-- 
2.43.5


Reply via email to