From: Yue Hu <[email protected]> Previously, there's no fragment deduplication when this feature is introduced. Let's support it now.
We intend to dedupe the fragments before compression, so that duplicate parts will not be written into packed inode. I have tested some types with dataset of Linux 5.10.1 + 5.10.87 source code. The results of image size in MiB are: 32k pcluster + lz4hc,12 + T0 + fragment 311 64k pcluster + lz4hc,12 + T0 + fragment 295 128k pcluster + lz4hc,12 + T0 + fragment 287 32k pcluster + lz4hc,12 + T0 + fragment + dedupe 286 64k pcluster + lz4hc,12 + T0 + fragment + dedupe 281 128k pcluster + lz4hc,12 + T0 + fragment + dedupe 278 Before the patch, they were: 32k pcluster + lz4hc,12 + T0 + fragment 450 64k pcluster + lz4hc,12 + T0 + fragment 434 128k pcluster + lz4hc,12 + T0 + fragment 426 32k pcluster + lz4hc,12 + T0 + fragment + dedupe 368 64k pcluster + lz4hc,12 + T0 + fragment + dedupe 380 128k pcluster + lz4hc,12 + T0 + fragment + dedupe 395 Test results on squashfs (which uses level 12 by default for lz4hc): 32k block + lz4hc 332 64k block + lz4hc 304 128k block + lz4hc 283 256k block + lz4hc 273 256k block + lz4hc + noI 278 Suggested-by: Gao Xiang <[email protected]> Signed-off-by: Yue Hu <[email protected]> --- v5: - fix the issue that decompression may fail when ztailpacking is enabled as well - fix the issue that decompression may fail when dedupe is enabled as well in v4 - minor commit message update v4: - renaming include tofcrc/new_fragmentsize - move fixup into ctx - use may_fixing to check packing fragment or not - move sb/inode flag + 64bits case from erofs_pack_fragments() to new helper erofs_fragments_commit() - move recompress ahead of may_inline case when compressing succeeds - update commit message/code comments - change to generate a ctx for duplicate fragment in compression - note that decompress will fail when enable ztailpacking at the same time, need some time to debug v3: - modify acrroding to Xiang's comments in v2 - simplify the logic in vle_compress_one - fix the crash for 1MB pcluster v2: mainly improve the logic in compression include/erofs/fragments.h | 4 +- lib/compress.c | 140 ++++++++++++++++++++++----- lib/fragments.c | 197 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 309 insertions(+), 32 deletions(-) diff --git a/include/erofs/fragments.h b/include/erofs/fragments.h index 5444384..e91fb31 100644 --- a/include/erofs/fragments.h +++ b/include/erofs/fragments.h @@ -15,8 +15,10 @@ extern "C" extern const char *frags_packedname; #define EROFS_PACKED_INODE frags_packedname +int z_erofs_fragments_dedupe(struct erofs_inode *inode, int fd, u32 *tofcrc_r); int z_erofs_pack_fragments(struct erofs_inode *inode, void *data, - unsigned int len); + unsigned int len, u32 tofcrc); +void z_erofs_fragments_commit(struct erofs_inode *inode); struct erofs_inode *erofs_mkfs_build_fragments(void); int erofs_fragments_init(void); void erofs_fragments_exit(void); diff --git a/lib/compress.c b/lib/compress.c index 8f4c63a..5f587e9 100644 --- a/lib/compress.c +++ b/lib/compress.c @@ -33,6 +33,11 @@ struct z_erofs_vle_compress_ctx { unsigned int head, tail; erofs_blk_t blkaddr; /* pointing to the next blkaddr */ u16 clusterofs; + + u32 tofcrc; + unsigned int pclustersize; + erofs_off_t remaining; + bool need_fix; }; #define Z_EROFS_LEGACY_MAP_HEADER_SIZE \ @@ -162,10 +167,10 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx) ctx->clusterofs = clusterofs + count; } -static int z_erofs_compress_dedupe(struct erofs_inode *inode, - struct z_erofs_vle_compress_ctx *ctx, +static int z_erofs_compress_dedupe(struct z_erofs_vle_compress_ctx *ctx, unsigned int *len) { + struct erofs_inode *inode = ctx->inode; int ret = 0; do { @@ -311,10 +316,6 @@ static void tryrecompress_trailing(void *in, unsigned int *insize, unsigned int count; int ret = *compressedsize; - /* no need to recompress */ - if (!(ret & (EROFS_BLKSIZ - 1))) - return; - count = *insize; ret = erofs_compress_destsize(&compresshandle, in, &count, (void *)tmp, @@ -329,30 +330,71 @@ static void tryrecompress_trailing(void *in, unsigned int *insize, *compressedsize = ret; } -static int vle_compress_one(struct z_erofs_vle_compress_ctx *ctx, - bool final) +static void z_erofs_fragments_dedupe_update(struct z_erofs_vle_compress_ctx *ctx, + unsigned int *len) +{ + struct erofs_inode *inode = ctx->inode; + const unsigned int new_fragmentsize = ctx->remaining + *len; + + DBG_BUGON(!inode->fragment_size); + + /* try to fix it again if it gets larger (should be rare) */ + if (inode->fragment_size < new_fragmentsize) { + ctx->pclustersize = + roundup(new_fragmentsize - inode->fragment_size, + EROFS_BLKSIZ); + return; + } + + inode->fragmentoff += inode->fragment_size - new_fragmentsize; + inode->fragment_size = new_fragmentsize; + + erofs_dbg("Reducing fragment size to %u at %lu", + inode->fragment_size, inode->fragmentoff); + + /* it's ending */ + ctx->head += new_fragmentsize; + ctx->remaining = 0; + *len = 0; +} + +static int vle_compress_one(struct z_erofs_vle_compress_ctx *ctx) { static char dstbuf[EROFS_CONFIG_COMPR_MAX_SZ + EROFS_BLKSIZ]; struct erofs_inode *inode = ctx->inode; char *const dst = dstbuf + EROFS_BLKSIZ; struct erofs_compress *const h = &compresshandle; unsigned int len = ctx->tail - ctx->head; + bool final = !ctx->remaining; int ret; while (len) { - unsigned int pclustersize = - z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ; - bool may_inline = (cfg.c_ztailpacking && final); + bool may_inline = (cfg.c_ztailpacking && final && + !inode->fragment_size); bool may_packing = (cfg.c_fragments && final && !erofs_is_packed_inode(inode)); + bool may_fixing = ctx->need_fix; - if (z_erofs_compress_dedupe(inode, ctx, &len) && !final) + if (z_erofs_compress_dedupe(ctx, &len) && !final) break; - if (len <= pclustersize) { - if (!final || !len) + if (len <= ctx->pclustersize) { + if (!final) break; + if (!len) { + if (!inode->fragment_size) + break; + goto duplicate_frag; + } if (may_packing) { + if (inode->fragment_size && !may_fixing) { + ctx->remaining = inode->fragment_size; + ctx->pclustersize = + roundup(len, EROFS_BLKSIZ); + ctx->e.length = 0; + ctx->need_fix = true; + return 0; + } ctx->e.length = len; goto frag_packing; } @@ -363,7 +405,7 @@ static int vle_compress_one(struct z_erofs_vle_compress_ctx *ctx, ctx->e.length = min(len, cfg.c_max_decompressed_extent_bytes); ret = erofs_compress_destsize(h, ctx->queue + ctx->head, - &ctx->e.length, dst, pclustersize, + &ctx->e.length, dst, ctx->pclustersize, !(final && len == ctx->e.length)); if (ret <= 0) { if (ret != -EAGAIN) { @@ -395,15 +437,17 @@ nocompression: ctx->e.compressedblks = 1; ctx->e.raw = true; } else if (may_packing && len == ctx->e.length && - ret < pclustersize) { + ret < ctx->pclustersize && (!inode->fragment_size || + may_fixing)) { frag_packing: ret = z_erofs_pack_fragments(inode, ctx->queue + ctx->head, - len); + len, ctx->tofcrc); if (ret < 0) return ret; ctx->e.compressedblks = 0; /* indicate a fragment */ ctx->e.raw = true; + may_fixing = false; /* tailpcluster should be less than 1 block */ } else if (may_inline && len == ctx->e.length && ret < EROFS_BLKSIZ) { @@ -425,7 +469,27 @@ frag_packing: } else { unsigned int tailused, padding; - if (may_inline && len == ctx->e.length) + tailused = ret & (EROFS_BLKSIZ - 1); + /* + * If there's a space left for the last round when + * deduping fragments, try to read fragment and + * recompress a litte more to check whether it can be + * filled up. Fix the fragment if succeeds. Otherwise, + * just drop it and go to packing. + */ + if (may_packing && len == ctx->e.length && tailused && + ctx->tail < sizeof(ctx->queue)) { + DBG_BUGON(!inode->fragment_size); + + ctx->remaining = inode->fragment_size; + ctx->pclustersize = + BLK_ROUND_UP(ret) * EROFS_BLKSIZ; + ctx->e.length = 0; + ctx->need_fix = true; + return 0; + } + + if (may_inline && len == ctx->e.length && tailused) tryrecompress_trailing(ctx->queue + ctx->head, &ctx->e.length, dst, &ret); @@ -464,6 +528,21 @@ frag_packing: ctx->head += ctx->e.length; len -= ctx->e.length; + if (may_fixing) + z_erofs_fragments_dedupe_update(ctx, &len); + + /* generate a ctx for duplicate fragment */ + if (final && !len && inode->fragment_size && !may_packing) { +duplicate_frag: + z_erofs_write_indexes(ctx); + + ctx->e.length = inode->fragment_size; + ctx->e.compressedblks = 0; + ctx->e.raw = true; + ctx->e.partial = false; + ctx->e.blkaddr = ctx->blkaddr; + } + if (!final && ctx->head >= EROFS_CONFIG_COMPR_MAX_SZ) { const unsigned int qh_aligned = round_down(ctx->head, EROFS_BLKSIZ); @@ -746,7 +825,6 @@ int erofs_write_compressed_file(struct erofs_inode *inode, int fd) { struct erofs_buffer_head *bh; static struct z_erofs_vle_compress_ctx ctx; - erofs_off_t remaining; erofs_blk_t blkaddr, compressed_blocks; unsigned int legacymetasize; int ret; @@ -785,6 +863,16 @@ int erofs_write_compressed_file(struct erofs_inode *inode, int fd) inode->idata_size = 0; inode->fragment_size = 0; + /* + * Dedupe fragments before compression to avoid writing duplicate parts + * into packed inode. + */ + if (cfg.c_fragments && !erofs_is_packed_inode(inode)) { + ret = z_erofs_fragments_dedupe(inode, fd, &ctx.tofcrc); + if (ret < 0) + goto err_bdrop; + } + blkaddr = erofs_mapbh(bh->block); /* start_blkaddr */ ctx.inode = inode; ctx.blkaddr = blkaddr; @@ -792,10 +880,12 @@ int erofs_write_compressed_file(struct erofs_inode *inode, int fd) ctx.head = ctx.tail = 0; ctx.clusterofs = 0; ctx.e.length = 0; - remaining = inode->i_size; + ctx.pclustersize = z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ; + ctx.remaining = inode->i_size - inode->fragment_size; + ctx.need_fix = false; - while (remaining) { - const u64 readcount = min_t(u64, remaining, + while (ctx.remaining) { + const u64 readcount = min_t(u64, ctx.remaining, sizeof(ctx.queue) - ctx.tail); ret = read(fd, ctx.queue + ctx.tail, readcount); @@ -803,10 +893,10 @@ int erofs_write_compressed_file(struct erofs_inode *inode, int fd) ret = -errno; goto err_bdrop; } - remaining -= readcount; + ctx.remaining -= readcount; ctx.tail += readcount; - ret = vle_compress_one(&ctx, !remaining); + ret = vle_compress_one(&ctx); if (ret) goto err_free_idata; } @@ -817,6 +907,8 @@ int erofs_write_compressed_file(struct erofs_inode *inode, int fd) DBG_BUGON(compressed_blocks < !!inode->idata_size); compressed_blocks -= !!inode->idata_size; + z_erofs_fragments_commit(inode); + z_erofs_write_indexes(&ctx); z_erofs_write_indexes_final(&ctx); legacymetasize = ctx.metacur - compressmeta; diff --git a/lib/fragments.c b/lib/fragments.c index b8c37d5..e50937c 100644 --- a/lib/fragments.c +++ b/lib/fragments.c @@ -10,17 +10,181 @@ #include "erofs/inode.h" #include "erofs/compress.h" #include "erofs/print.h" +#include "erofs/internal.h" #include "erofs/fragments.h" +struct erofs_fragment_dedupe_item { + struct list_head list; + unsigned int length, nr_dup; + erofs_off_t pos; + u8 data[]; +}; + +#define FRAGMENT_HASHTABLE_SIZE 65536 +#define FRAGMENT_HASH(crc) (crc & (FRAGMENT_HASHTABLE_SIZE - 1)) + +static struct list_head dupli_frags[FRAGMENT_HASHTABLE_SIZE]; +static unsigned int len_to_hash; /* the fragment length for crc32 hash */ + static FILE *packedfile; const char *frags_packedname = "packed_file"; -int z_erofs_pack_fragments(struct erofs_inode *inode, void *data, - unsigned int len) +static int z_erofs_fragments_dedupe_find(struct erofs_inode *inode, int fd, + u32 crc) { - inode->z_advise |= Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; - inode->fragmentoff = ftell(packedfile); - inode->fragment_size = len; + struct erofs_fragment_dedupe_item *cur, *di = NULL; + struct list_head *head; + u8 *data; + unsigned int length; + int ret; + + head = &dupli_frags[FRAGMENT_HASH(crc)]; + if (list_empty(head)) + return 0; + + /* XXX: no need to read so much for smaller? */ + if (inode->i_size < EROFS_CONFIG_COMPR_MAX_SZ) + length = inode->i_size; + else + length = EROFS_CONFIG_COMPR_MAX_SZ; + + data = malloc(length); + if (!data) + return -ENOMEM; + + ret = lseek(fd, inode->i_size - length, SEEK_SET); + if (ret == -1) { + ret = -errno; + goto out; + } + + ret = read(fd, data, length); + if (ret != length) { + ret = -errno; + goto out; + } + + list_for_each_entry(cur, head, list) { + unsigned int e1, e2, i = 0; + + DBG_BUGON(cur->length < len_to_hash + 1); + DBG_BUGON(length < len_to_hash + 1); + + e1 = cur->length - len_to_hash - 1; + e2 = length - len_to_hash - 1; + + if (memcmp(cur->data + e1 + 1, data + e2 + 1, len_to_hash)) + continue; + + while (i <= min(e1, e2) && cur->data[e1 - i] == data[e2 - i]) + i++; + + if (i && (!di || i + len_to_hash > di->nr_dup)) { + cur->nr_dup = i + len_to_hash; + di = cur; + + /* full match */ + if (i == min(e1, e2) + 1) + break; + } + } + if (!di) + goto out; + + DBG_BUGON(di->length < di->nr_dup); + + inode->fragment_size = di->nr_dup; + inode->fragmentoff = di->pos + di->length - di->nr_dup; + + erofs_dbg("Dedupe %u fragment data at %lu", inode->fragment_size, + inode->fragmentoff); +out: + free(data); + return ret; +} + +int z_erofs_fragments_dedupe(struct erofs_inode *inode, int fd, u32 *tofcrc_r) +{ + u8 data_to_hash[len_to_hash]; + u32 crc; + int ret; + + if (inode->i_size <= len_to_hash) + return 0; + + ret = lseek(fd, inode->i_size - len_to_hash, SEEK_SET); + if (ret == -1) + return -errno; + + ret = read(fd, data_to_hash, len_to_hash); + if (ret != len_to_hash) + return -errno; + + crc = erofs_crc32c(~0, data_to_hash, len_to_hash); + *tofcrc_r = crc; + + ret = z_erofs_fragments_dedupe_find(inode, fd, crc); + if (ret < 0) + return ret; + + ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) + return -errno; + return 0; +} + +static int z_erofs_fragments_dedupe_insert(void *data, unsigned int len, + erofs_off_t pos, u32 crc) +{ + struct erofs_fragment_dedupe_item *di; + + if (len <= len_to_hash) + return 0; + + di = malloc(sizeof(*di) + len); + if (!di) + return -ENOMEM; + + memcpy(di->data, data, len); + di->length = len; + di->pos = pos; + di->nr_dup = 0; + + list_add_tail(&di->list, &dupli_frags[FRAGMENT_HASH(crc)]); + return 0; +} + +static inline void z_erofs_fragments_dedupe_init(unsigned int clen) +{ + unsigned int i; + + for (i = 0; i < FRAGMENT_HASHTABLE_SIZE; ++i) + init_list_head(&dupli_frags[i]); + + len_to_hash = clen; +} + +static void z_erofs_fragments_dedupe_exit(void) +{ + struct erofs_fragment_dedupe_item *di, *n; + struct list_head *head; + unsigned int i; + + for (i = 0; i < FRAGMENT_HASHTABLE_SIZE; ++i) { + head = &dupli_frags[i]; + + if (list_empty(head)) + continue; + + list_for_each_entry_safe(di, n, head, list) + free(di); + } +} + +void z_erofs_fragments_commit(struct erofs_inode *inode) +{ + if (!inode->fragment_size) + return; /* * If the packed inode is larger than 4GiB, the full fragmentoff * will be recorded by switching to the noncompact layout anyway. @@ -28,13 +192,28 @@ int z_erofs_pack_fragments(struct erofs_inode *inode, void *data, if (inode->fragmentoff >> 32) inode->datalayout = EROFS_INODE_FLAT_COMPRESSION_LEGACY; + inode->z_advise |= Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + erofs_sb_set_fragments(); +} + +int z_erofs_pack_fragments(struct erofs_inode *inode, void *data, + unsigned int len, u32 tofcrc) +{ + int ret; + + inode->fragmentoff = ftell(packedfile); + inode->fragment_size = len; + if (fwrite(data, len, 1, packedfile) != 1) return -EIO; - erofs_sb_set_fragments(); - erofs_dbg("Recording %u fragment data at %lu", inode->fragment_size, inode->fragmentoff); + + ret = z_erofs_fragments_dedupe_insert(data, len, inode->fragmentoff, + tofcrc); + if (ret) + return ret; return len; } @@ -50,6 +229,8 @@ void erofs_fragments_exit(void) { if (packedfile) fclose(packedfile); + + z_erofs_fragments_dedupe_exit(); } int erofs_fragments_init(void) @@ -61,5 +242,7 @@ int erofs_fragments_init(void) #endif if (!packedfile) return -ENOMEM; + + z_erofs_fragments_dedupe_init(16); return 0; } -- 2.17.1
