While static batching successfully overlaps I/O and compute, different
compression algorithms exhibit vastly different scheduling thresholds.
Extremely fast algorithms like LZ4 require large batches (e.g., 32
pclusters) to effectively hide the synchronization overhead of the
thread pool.

Conversely, applying this large batch size to compute-heavy algorithms
like LZMA or ZSTD causes memory bloat and thread starvation, as the
main thread spends too much time reading and accumulating buffers before
waking up the background workers.

This patch modifies the workqueue submission logic in
z_erofs_read_one_data() to scale the batch size dynamically based on the
compression algorithm format. LZ4 may fill a batch up to
Z_EROFS_PCLUSTER_MAX_BATCH_SIZE (32 pclusters; renamed from
Z_EROFS_PCLUSTER_BATCH_SIZE), while heavier algorithms trigger workqueue
submission at a much lower threshold (8 pclusters) to ensure a steady
pipeline of work and a bounded memory footprint.

Signed-off-by: Nithurshen <[email protected]>
---
 include/erofs/internal.h |  2 +-
 lib/data.c               | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 38020ee..c8f056f 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -62,7 +62,7 @@ struct erofs_buf {
 #define erofs_pos(sbi, nr)      ((erofs_off_t)(nr) << (sbi)->blkszbits)
 #define BLK_ROUND_UP(sbi, addr)        \
        (roundup(addr, erofs_blksiz(sbi)) >> (sbi)->blkszbits)
-#define Z_EROFS_PCLUSTER_BATCH_SIZE 32
+#define Z_EROFS_PCLUSTER_MAX_BATCH_SIZE 32
 
 struct erofs_buffer_head;
 struct erofs_bufmgr;
diff --git a/lib/data.c b/lib/data.c
index fa36899..a06f4c2 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -17,11 +17,11 @@ struct erofs_workqueue erofs_wq;
 struct z_erofs_decompress_task {
        struct erofs_work work;
        struct z_erofs_read_ctx *ctx;
-       struct z_erofs_decompress_req reqs[Z_EROFS_PCLUSTER_BATCH_SIZE];
-       char *raw_bufs[Z_EROFS_PCLUSTER_BATCH_SIZE];
-       char *out_bufs[Z_EROFS_PCLUSTER_BATCH_SIZE];
-       erofs_off_t out_offsets[Z_EROFS_PCLUSTER_BATCH_SIZE];
-       unsigned int out_lengths[Z_EROFS_PCLUSTER_BATCH_SIZE];
+       struct z_erofs_decompress_req reqs[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
+       char *raw_bufs[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
+       char *out_bufs[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
+       erofs_off_t out_offsets[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
+       unsigned int out_lengths[Z_EROFS_PCLUSTER_MAX_BATCH_SIZE];
        unsigned int nr_reqs;
 };
 
@@ -397,7 +397,10 @@ int z_erofs_read_one_data(struct erofs_inode *inode,
        task->out_offsets[idx] = out_offset;
        task->out_lengths[idx] = length;
 
-       if (task->nr_reqs == Z_EROFS_PCLUSTER_BATCH_SIZE) {
+       int batch_limit = (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZ4) ? 
+                                               Z_EROFS_PCLUSTER_MAX_BATCH_SIZE : 8;
+
+       if (task->nr_reqs >= batch_limit) {
                z_erofs_read_ctx_enqueue(ctx);
        }
        return 0;
-- 
2.52.0


Reply via email to