Hi Nithurshen,

On Sat, May 23, 2026 at 06:07:56AM +0530, Nithurshen wrote:
> Currently, fsck.erofs extracts files synchronously. When decompressing
> heavily packed images (like LZ4HC with 4K pclusters), the main thread
> spends the majority of its time blocked on a combination of synchronous
> vfs_write() syscalls and LZ4_decompress_safe(), bottlenecking overall
> extraction speed.
> 
> This patch introduces a scalable, multi-threaded decompression framework
> using the existing erofs_workqueue infrastructure to decouple compute
> from the main thread's I/O.
> 
> To prevent massive scheduling overhead (futex contention) where worker
> threads spend more CPU time waking up than actually decompressing small
> 4KB clusters, this implementation introduces a batching context. The
> main thread collects an array of sequential pclusters (temporarily hard-
> capped at Z_EROFS_PCLUSTER_BATCH_SIZE = 32) before submitting a single
> erofs_work unit.
> 
> Key details of this implementation:
> - The worker pool is dynamically sized based on available system CPUs.
> - Decompression tasks take strict ownership of the raw and output
>   buffers (safely tracking memory via a `free_out` flag) to prevent
>   data races and memory leaks.
> - Output buffers are explicitly zero-initialized via calloc() to
>   prevent trailing garbage bytes from leaking into extracted files.
> - Tail-end packed fragments are processed synchronously by the main
>   thread, as their minimal overhead does not benefit from asynchronous
>   offloading.
> 
> Signed-off-by: Nithurshen <[email protected]>
> ---
>  fsck/main.c              | 234 +++++++++++++++++----------------------
>  include/erofs/internal.h |  18 ++-
>  lib/data.c               | 203 +++++++++++++++++++++++----------
>  3 files changed, 265 insertions(+), 190 deletions(-)
> 
> diff --git a/fsck/main.c b/fsck/main.c
> index 16cc627..d7810e8 100644
> --- a/fsck/main.c
> +++ b/fsck/main.c
> @@ -8,14 +8,18 @@
>  #include <time.h>
>  #include <utime.h>
>  #include <unistd.h>
> +#include <pthread.h>
>  #include <sys/stat.h>
>  #include "erofs/print.h"
>  #include "erofs/decompress.h"
>  #include "erofs/dir.h"
>  #include "erofs/xattr.h"
> +#include "erofs/workqueue.h"
>  #include "../lib/compressor.h"
>  #include "../lib/liberofs_compress.h"
>  
> +extern struct erofs_workqueue erofs_wq;
> +
>  static int erofsfsck_check_inode(erofs_nid_t pnid, erofs_nid_t nid);
>  
>  struct erofsfsck_dirstack {
> @@ -505,135 +509,95 @@ out:
>  
>  static int erofs_verify_inode_data(struct erofs_inode *inode, int outfd)
>  {
> -     struct erofs_map_blocks map = {
> -             .buf = __EROFS_BUF_INITIALIZER,
> -     };
> -     bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
> -     int ret = 0;
> -     bool compressed;
> -     erofs_off_t pos = 0;
> -     u64 pchunk_len = 0;
> -     unsigned int raw_size = 0, buffer_size = 0;
> -     char *raw = NULL, *buffer = NULL;
> -
> -     erofs_dbg("verify data chunk of nid(%llu): type(%d)",
> -               inode->nid | 0ULL, inode->datalayout);
> -
> -     compressed = erofs_inode_is_data_compressed(inode->datalayout);
> -     while (pos < inode->i_size) {
> -             unsigned int alloc_rawsize;
> -
> -             map.m_la = pos;
> -             ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
> -             if (ret)
> -                     goto out;
> -
> -             if (!compressed && map.m_llen != map.m_plen) {
> -                     erofs_err("broken chunk length m_la %" PRIu64 " m_llen 
> %" PRIu64 " m_plen %" PRIu64,
> -                               map.m_la, map.m_llen, map.m_plen);
> -                     ret = -EFSCORRUPTED;
> -                     goto out;
> -             }
> -
> -             /* the last lcluster can be divided into 3 parts */
> -             if (map.m_la + map.m_llen > inode->i_size)
> -                     map.m_llen = inode->i_size - map.m_la;
> -
> -             pchunk_len += map.m_plen;
> -             pos += map.m_llen;
> -
> -             /* should skip decomp? */
> -             if (map.m_la >= inode->i_size || !needdecode)
> -                     continue;
> +    struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER };
> +    bool needdecode = fsckcfg.check_decomp && !erofs_is_packed_inode(inode);
> +    int ret = 0;
> +    bool compressed = erofs_inode_is_data_compressed(inode->datalayout);
> +    erofs_off_t pos = 0;
> +    u64 pchunk_len = 0;
> +
> +    struct z_erofs_read_ctx ctx = {
> +        .pending_tasks = 0,
> +        .final_err = 0,
> +        .outfd = outfd,
> +             .free_out = true,
> +        .current_task = NULL
> +    };
> +    pthread_mutex_init(&ctx.lock, NULL);
> +    pthread_cond_init(&ctx.cond, NULL);

Please avoid barely used pthread interface.

For example, erofs_mutex_t is needed for erofs-utils
instead of pthread_mutex_t;

pthread_cond_init needs to be replaced by an abstract
too.

Also, I may forget to mention that, your new
implementation should be workable for both EROFS_MT_ENABLED
and !EROFS_MT_ENABLED, not only EROFS_MT_ENABLED.

> +
> +    erofs_dbg("verify data chunk of nid(%llu): type(%d)", inode->nid | 0ULL, 
> inode->datalayout);
> +
> +    while (pos < inode->i_size) {
> +        map.m_la = pos;
> +        ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
> +        if (ret) goto out;
> +
> +        if (map.m_la + map.m_llen > inode->i_size)
> +            map.m_llen = inode->i_size - map.m_la;
> +
> +        pchunk_len += map.m_plen;
> +        pos += map.m_llen;
> +
> +        if (map.m_la >= inode->i_size || !needdecode)
> +            continue;
> +
> +        if (outfd >= 0 && !(map.m_flags & EROFS_MAP_MAPPED)) {
> +            ret = lseek(outfd, map.m_llen, SEEK_CUR);
> +            if (ret < 0) {
> +                ret = -errno;
> +                goto out;
> +            }
> +            continue;
> +        }
> +
> +        if (compressed) {
> +            char *raw = malloc(map.m_plen);
> +            size_t buffer_size = map.m_llen > erofs_blksiz(inode->sbi) ? 
> map.m_llen : erofs_blksiz(inode->sbi);
> +            char *buffer = calloc(1, buffer_size);
> +            
> +            if (!raw || !buffer) {
> +                free(raw); free(buffer);
> +                ret = -ENOMEM;
> +                goto out;
> +            }
> +
> +            ret = z_erofs_read_one_data(inode, &map, raw, buffer, 0, 
> map.m_llen, false, map.m_la, &ctx);
> +            if (ret) {
> +                /* DO NOT free(raw) or free(buffer) here. 
> z_erofs_read_one_data took ownership! */
> +                goto out;
> +            }
> +        } else {
> +            char *raw = calloc(1, map.m_llen);
> +            ret = erofs_read_one_data(inode, &map, raw, 0, map.m_llen);
> +            if (ret >= 0 && outfd >= 0)
> +                pwrite(outfd, raw, map.m_llen, map.m_la);
> +            free(raw);
> +            if (ret) goto out;
> +        }

I think the file read should be multithreaded too, especially the inode
could be large.

Thanks,
Gao Xiang

Reply via email to