Hi Nithurshen,
On Sat, May 23, 2026 at 06:07:56AM +0530, Nithurshen wrote:
> Currently, fsck.erofs extracts files synchronously. When decompressing
> heavily packed images (like LZ4HC with 4K pclusters), the main thread
> spends the majority of its time blocked on a combination of synchronous
> vfs_write() syscalls and LZ4_decompress_safe(), bottlenecking overall
> extraction speed.
>
> This patch introduces a scalable, multi-threaded decompression framework
> using the existing erofs_workqueue infrastructure to decouple compute
> from the main thread's I/O.
>
> To prevent massive scheduling overhead (futex contention) where worker
> threads spend more CPU time waking up than actually decompressing small
> 4KB clusters, this implementation introduces a batching context. The
> main thread collects an array of sequential pclusters (temporarily hard-
> capped at Z_EROFS_PCLUSTER_BATCH_SIZE = 32) before submitting a single
> erofs_work unit.
>
> Key details of this implementation:
> - The worker pool is dynamically sized based on available system CPUs.
> - Decompression tasks take strict ownership of the raw and output
> buffers (safely tracking memory via a `free_out` flag) to prevent
> data races and memory leaks.
> - Output buffers are explicitly zero-initialized via calloc() to
> prevent trailing garbage bytes from leaking into extracted files.
> - Tail-end packed fragments are processed synchronously by the main
> thread, as their minimal overhead does not benefit from asynchronous
> offloading.
>
> Signed-off-by: Nithurshen <[email protected]>
> ---
> fsck/main.c | 234 +++++++++++++++++----------------------
> include/erofs/internal.h | 18 ++-
> lib/data.c | 203 +++++++++++++++++++++++----------
> 3 files changed, 265 insertions(+), 190 deletions(-)
>
> diff --git a/fsck/main.c b/fsck/main.c
> index 16cc627..d7810e8 100644
> --- a/fsck/main.c
> +++ b/fsck/main.c
> @@ -8,14 +8,18 @@
> #include <time.h>
> #include <utime.h>
> #include <unistd.h>
> +#include <pthread.h>
> #include <sys/stat.h>
> #include "erofs/print.h"
> #include "erofs/decompress.h"
> #include "erofs/dir.h"
> #include "erofs/xattr.h"
> +#include "erofs/workqueue.h"
> #include "../lib/compressor.h"
> #include "../lib/liberofs_compress.h"
>
> +extern struct erofs_workqueue erofs_wq;
> +
> static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid);
>
> struct erofsfsck_dirstack {
> @@ -505,135 +509,95 @@ out:
>
> static int erofs_verify_inode_data(struct erofs_inode *inode, int outfd)
> {
> - struct erofs_map_blocks map = {
> - .buf = __EROFS_BUF_INITIALIZER,
> - };
> - bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
> - int ret = 0;
> - bool compressed;
> - erofs_off_t pos = 0;
> - u64 pchunk_len = 0;
> - unsigned int raw_size = 0, buffer_size = 0;
> - char *raw = NULL, *buffer = NULL;
> -
> - erofs_dbg("verify data chunk of nid(%llu): type(%d)",
> - inode->nid | 0ULL, inode->datalayout);
> -
> - compressed = erofs_inode_is_data_compressed(inode->datalayout);
> - while (pos < inode->i_size) {
> - unsigned int alloc_rawsize;
> -
> - map.m_la = pos;
> - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
> - if (ret)
> - goto out;
> -
> - if (!compressed && map.m_llen != map.m_plen) {
> - erofs_err("broken chunk length m_la %" PRIu64 " m_llen
> %" PRIu64 " m_plen %" PRIu64,
> - map.m_la, map.m_llen, map.m_plen);
> - ret = -EFSCORRUPTED;
> - goto out;
> - }
> -
> - /* the last lcluster can be divided into 3 parts */
> - if (map.m_la + map.m_llen > inode->i_size)
> - map.m_llen = inode->i_size - map.m_la;
> -
> - pchunk_len += map.m_plen;
> - pos += map.m_llen;
> -
> - /* should skip decomp? */
> - if (map.m_la >= inode->i_size || !needdecode)
> - continue;
> + struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER };
> + bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
> + int ret = 0;
> + bool compressed = erofs_inode_is_data_compressed(inode->datalayout);
> + erofs_off_t pos = 0;
> + u64 pchunk_len = 0;
> +
> + struct z_erofs_read_ctx ctx = {
> + .pending_tasks = 0,
> + .final_err = 0,
> + .outfd = outfd,
> + .free_out = true,
> + .current_task = NULL
> + };
> + pthread_mutex_init(&ctx.lock, NULL);
> + pthread_cond_init(&ctx.cond, NULL);
Please avoid barely used pthread interface.
For example, erofs_mutex_t is needed for erofs-utils
instead of pthread_mutex_t;
pthread_cond_init needs to be replaced by an abstract
too.
Also, I may forget to mention that, your new
implementation should be workable for both EROFS_MT_ENABLED
and !EROFS_MT_ENABLED, not only EROFS_MT_ENABLED.
> +
> + erofs_dbg("verify data chunk of nid(%llu): type(%d)", inode->nid | 0ULL,
> inode->datalayout);
> +
> + while (pos < inode->i_size) {
> + map.m_la = pos;
> + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
> + if (ret) goto out;
> +
> + if (map.m_la + map.m_llen > inode->i_size)
> + map.m_llen = inode->i_size - map.m_la;
> +
> + pchunk_len += map.m_plen;
> + pos += map.m_llen;
> +
> + if (map.m_la >= inode->i_size || !needdecode)
> + continue;
> +
> + if (outfd >= 0 && !(map.m_flags & EROFS_MAP_MAPPED)) {
> + ret = lseek(outfd, map.m_llen, SEEK_CUR);
> + if (ret < 0) {
> + ret = -errno;
> + goto out;
> + }
> + continue;
> + }
> +
> + if (compressed) {
> + char *raw = malloc(map.m_plen);
> + size_t buffer_size = map.m_llen > erofs_blksiz(inode->sbi) ?
> map.m_llen : erofs_blksiz(inode->sbi);
> + char *buffer = calloc(1, buffer_size);
> +
> + if (!raw || !buffer) {
> + free(raw); free(buffer);
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + ret = z_erofs_read_one_data(inode, &map, raw, buffer, 0,
> map.m_llen, false, map.m_la, &ctx);
> + if (ret) {
> + /* DO NOT free(raw) or free(buffer) here.
> z_erofs_read_one_data took ownership! */
> + goto out;
> + }
> + } else {
> + char *raw = calloc(1, map.m_llen);
> + ret = erofs_read_one_data(inode, &map, raw, 0, map.m_llen);
> + if (ret >= 0 && outfd >= 0)
> + pwrite(outfd, raw, map.m_llen, map.m_la);
> + free(raw);
> + if (ret) goto out;
> + }
I think the file read should be multithreaded too, especially the inode
could be large.
Thanks,
Gao Xiang