perf on fsck.erofs reports that z_erofs_load_compact_lcluster spends 20% of its time executing the div instruction, while the function itself accounts for ~40% of the fsck.erofs runtime. From the source code, it seems that the compiler cannot optimize the division by vcnt into a shift, even though vcnt only ever holds powers of two.
Running a benchmark on a lzma compressed freebsd source tree on x86 yields a ~3% increase in performance. The following test was run locally on an x86 machine. $ hyperfine -w 10 -p "echo 3 > /proc/sys/vm/drop_caches; sleep 1" \ "./fsck.erofs ./bsd.erofs.lzma" With shift optimization Time (mean ± σ): 360.0 ms ± 12.0 ms \ [User: 236.3 ms, System: 120.6 ms] Range (min … max): 342.3 ms … 379.8 ms 10 runs Original Dev Branch Time (mean ± σ): 371.1 ms ± 16.1 ms \ [User: 254.8 ms, System: 115.0 ms] Range (min … max): 354.8 ms … 404.4 ms 10 runs Signed-off-by: Ashley Lee <[email protected]> --- lib/zmap.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lib/zmap.c b/lib/zmap.c index baec278..1ba52b5 100644 --- a/lib/zmap.c +++ b/lib/zmap.c @@ -112,7 +112,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, const unsigned int lclusterbits = vi->z_lclusterbits; const unsigned int totalidx = BLK_ROUND_UP(sbi, vi->i_size); unsigned int compacted_4b_initial, compacted_2b, amortizedshift; - unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; + unsigned int vcnt, vdiv, lo, lobits, encodebits, nblk, bytes; bool big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; erofs_off_t pos; u8 *in, type; @@ -144,13 +144,16 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, pos += lcn * (1 << amortizedshift); /* figure out the lcluster count in this pack */ - if (1 << amortizedshift == 4 && lclusterbits <= 14) + if (1 << amortizedshift == 4 && lclusterbits <= 14) { vcnt = 2; - else if (1 << amortizedshift == 2 && lclusterbits <= 12) + vdiv = 1; + } else if (1 << amortizedshift == 2 && lclusterbits <= 12) { vcnt = 16; - else + vdiv = 4; + } else { return -EOPNOTSUPP; - + } + in = erofs_read_metabuf(&m->map->buf, sbi, pos, erofs_inode_in_metabox(vi)); if (IS_ERR(in)) @@ -160,7 +163,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, m->nextpackoff = round_down(pos, vcnt << 
amortizedshift) + (vcnt << amortizedshift); lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U); - encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + encodebits = (((vcnt << amortizedshift) - sizeof(__le32)) * 8) >> vdiv; bytes = pos & ((vcnt << amortizedshift) - 1); in -= bytes; i = bytes >> amortizedshift; -- 2.53.0
