The --ext=# option lets the user specify how many of the most frequently occurring file extensions to show (the top #), instead of relying on a list of file extensions hardcoded in advance.
Signed-off-by: Qi Wang <[email protected]> --- dump/main.c | 131 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 30 deletions(-) diff --git a/dump/main.c b/dump/main.c index 49ff2b7..188ab34 100644 --- a/dump/main.c +++ b/dump/main.c @@ -6,6 +6,7 @@ * Guo Xuenan <[email protected]> */ #define _GNU_SOURCE +#include <string.h> #include <stdlib.h> #include <getopt.h> #include <time.h> @@ -15,6 +16,7 @@ #include "erofs/io.h" #include "erofs/dir.h" #include "../lib/liberofs_private.h" +#include "erofs/hashmap.h" #ifdef HAVE_LIBUUID #include <uuid.h> @@ -29,17 +31,34 @@ struct erofsdump_cfg { bool show_subdirectories; erofs_nid_t nid; const char *inode_path; + unsigned int show_ext_count; }; static struct erofsdump_cfg dumpcfg; static const char chart_format[] = "%-16s %-11d %8.2f%% |%-50s|\n"; static const char header_format[] = "%-16s %11s %16s |%-50s|\n"; -static char *file_types[] = { - ".txt", ".so", ".xml", ".apk", - ".odex", ".vdex", ".oat", ".rc", - ".otf", ".txt", "others", + +struct postfix_statistics { + struct hashmap_entry ent; + char postfix[16]; + unsigned int count; + unsigned long occupied_size; + unsigned long original_size; }; -#define OTHERFILETYPE ARRAY_SIZE(file_types) + +static int erofs_postfix_hashmap_cmp(const void *a, const void *b, + const void *key) +{ + const struct postfix_statistics *ps1 = + container_of((struct hashmap_entry *)a, + struct postfix_statistics, ent); + const struct postfix_statistics *ps2 = + container_of((struct hashmap_entry *)b, + struct postfix_statistics, ent); + + return strncmp(ps1->postfix, key ? 
key : ps2->postfix, sizeof(ps1->postfix)); +} + /* (1 << FILE_MAX_SIZE_BITS)KB */ #define FILE_MAX_SIZE_BITS 16 @@ -65,7 +84,7 @@ struct erofs_statistics { /* [statistics] # of files based on inode_info->flags */ unsigned long file_category_stat[EROFS_FT_MAX]; /* [statistics] # of files based on file name extensions */ - unsigned int file_type_stat[OTHERFILETYPE]; + struct hashmap postfix_hashmap; /* [statistics] # of files based on the original size of files */ unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1]; /* [statistics] # of files based on the compressed size of files */ @@ -79,6 +98,7 @@ static struct option long_options[] = { {"device", required_argument, NULL, 3}, {"path", required_argument, NULL, 4}, {"ls", no_argument, NULL, 5}, + {"ext", required_argument, NULL, 6}, {0, 0, 0, 0}, }; @@ -111,6 +131,7 @@ static void usage(void) " --ls show directory contents (INODE required)\n" " --nid=# show the target inode info of nid #\n" " --path=X show the target inode info of path X\n" + " --ext=# show the top # extension file info\n" " --help display this help and exit.\n", stderr); } @@ -164,6 +185,9 @@ static int erofsdump_parse_options_cfg(int argc, char **argv) case 5: dumpcfg.show_subdirectories = true; break; + case 6: + dumpcfg.show_ext_count = atoi(optarg); + break; default: return -EINVAL; } @@ -208,20 +232,41 @@ static int erofsdump_get_occupied_size(struct erofs_inode *inode, return 0; } -static void inc_file_extension_count(const char *dname, unsigned int len) +static void inc_file_extension_count(const char *dname, unsigned int len, + unsigned long occupied_size, unsigned long original_size) { char *postfix = memrchr(dname, '.', len); - int type; + unsigned int hash, plen; + struct postfix_statistics *ps; + char pf[sizeof(ps->postfix)] = {0}; + + plen = len - (postfix - dname); + if (plen > sizeof(ps->postfix)) + plen = sizeof(ps->postfix); + if (postfix) { + memcpy(pf, postfix, plen); + hash = strhash(pf); + ps = 
hashmap_get_from_hash(&stats.postfix_hashmap, hash, pf); + if (ps) { + ps->count++; + ps->occupied_size += occupied_size; + ps->original_size += original_size; + return; + } + ps = malloc(sizeof(struct postfix_statistics)); + if (!ps) { + erofs_err("memory allocation failed!"); + return; + } - if (!postfix) { - type = OTHERFILETYPE - 1; - } else { - for (type = 0; type < OTHERFILETYPE - 1; ++type) - if (!strncmp(postfix, file_types[type], - len - (postfix - dname))) - break; + ps->count = 1; + ps->occupied_size = occupied_size; + ps->original_size = original_size; + memset(ps->postfix, 0, sizeof(ps->postfix)); + strncpy(ps->postfix, pf, plen); + hashmap_entry_init(&ps->ent, hash); + hashmap_add(&stats.postfix_hashmap, ps); } - ++stats.file_type_stat[type]; } static void update_file_size_statatics(erofs_off_t occupied_size, @@ -298,7 +343,7 @@ static int erofsdump_readdir(struct erofs_dir_context *ctx) if (S_ISREG(vi.i_mode)) { stats.files_total_origin_size += vi.i_size; - inc_file_extension_count(ctx->dname, ctx->de_namelen); + inc_file_extension_count(ctx->dname, ctx->de_namelen, occupied_size, vi.i_size); stats.files_total_size += occupied_size; update_file_size_statatics(occupied_size, vi.i_size); } @@ -481,27 +526,50 @@ static void erofsdump_filesize_distribution(const char *title, } } -static void erofsdump_filetype_distribution(char **file_types, unsigned int len) +static int comp_postfix_statistics(const void *a, const void *b) +{ + const struct postfix_statistics *psa, *psb; + + psa = a; + psb = b; + return psa->count < psb->count ? 1 : + (psa->count > psb->count) ? 
-1 : 0; +} + +static void erofsdump_filetype_distribution(int topk) { char col1[30]; - unsigned int col2, i; - double col3; + unsigned int col2, i, pos; + double col3, compression_rate; char col4[401]; - + struct postfix_statistics *ps_array; + struct postfix_statistics *ps; + struct hashmap_iter iter; + + pos = 0; + ps_array = malloc(sizeof(struct postfix_statistics) * stats.postfix_hashmap.size); + hashmap_iter_init(&stats.postfix_hashmap, &iter); + while ((ps = hashmap_iter_next(&iter))) + ps_array[pos++] = *ps; + + DBG_BUGON(pos != stats.postfix_hashmap.size); + qsort(ps_array, pos, sizeof(struct postfix_statistics), + comp_postfix_statistics); fprintf(stdout, "\nFile type distribution:\n"); fprintf(stdout, header_format, "type", "count", "ratio", - "distribution"); - for (i = 0; i < len; i++) { + "compression rate"); + for (i = 0; i < topk && i < pos; i++) { memset(col1, 0, sizeof(col1)); - memset(col4, 0, sizeof(col4)); - sprintf(col1, "%-17s", file_types[i]); - col2 = stats.file_type_stat[i]; + sprintf(col1, "%-17s", ps_array[i].postfix); + col2 = ps_array[i].count; if (stats.file_category_stat[EROFS_FT_REG_FILE]) col3 = (double)(100 * col2) / stats.file_category_stat[EROFS_FT_REG_FILE]; else col3 = 0.0; - memset(col4, '#', col3 / 2); + compression_rate = 100.0 * (double)ps_array[i].occupied_size / + (double)ps_array[i].original_size; + sprintf(col4, "%.2f%%", compression_rate); fprintf(stdout, chart_format, col1, col2, col3, col4); } } @@ -543,6 +611,7 @@ static void erofsdump_print_statistic(void) .de_namelen = 0, }; + hashmap_init(&stats.postfix_hashmap, erofs_postfix_hashmap_cmp, 0); err = erofsdump_readdir(&ctx); if (err) { erofs_err("read dir failed"); @@ -555,7 +624,7 @@ static void erofsdump_print_statistic(void) erofsdump_filesize_distribution("On-disk", stats.file_comp_size, ARRAY_SIZE(stats.file_comp_size)); - erofsdump_filetype_distribution(file_types, OTHERFILETYPE); + erofsdump_filetype_distribution(dumpcfg.show_ext_count); } static void 
erofsdump_show_superblock(void) @@ -624,9 +693,11 @@ int main(int argc, char **argv) if (dumpcfg.show_superblock) erofsdump_show_superblock(); - if (dumpcfg.show_statistics) + if (dumpcfg.show_statistics) { + if (dumpcfg.show_ext_count == 0) + dumpcfg.show_ext_count = 10; erofsdump_print_statistic(); - + } if (dumpcfg.show_extent && !dumpcfg.show_inode) { usage(); goto exit_dev_close; -- 2.30.2
