The --ext=# option lets the user list the top # file extensions by number
of occurrences, instead of relying on a hardcoded set of file extensions.

Signed-off-by: Qi Wang <[email protected]>
---
 dump/main.c | 131 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 30 deletions(-)

diff --git a/dump/main.c b/dump/main.c
index 49ff2b7..188ab34 100644
--- a/dump/main.c
+++ b/dump/main.c
@@ -6,6 +6,7 @@
  *            Guo Xuenan <[email protected]>
  */
 #define _GNU_SOURCE
+#include <string.h>
 #include <stdlib.h>
 #include <getopt.h>
 #include <time.h>
@@ -15,6 +16,7 @@
 #include "erofs/io.h"
 #include "erofs/dir.h"
 #include "../lib/liberofs_private.h"
+#include "erofs/hashmap.h"
 
 #ifdef HAVE_LIBUUID
 #include <uuid.h>
@@ -29,17 +31,34 @@ struct erofsdump_cfg {
        bool show_subdirectories;
        erofs_nid_t nid;
        const char *inode_path;
+       unsigned int show_ext_count;
 };
 static struct erofsdump_cfg dumpcfg;
 
 static const char chart_format[] = "%-16s      %-11d %8.2f%% |%-50s|\n";
 static const char header_format[] = "%-16s %11s %16s |%-50s|\n";
-static char *file_types[] = {
-       ".txt", ".so", ".xml", ".apk",
-       ".odex", ".vdex", ".oat", ".rc",
-       ".otf", ".txt", "others",
+
+struct postfix_statistics {            /* per-file-extension statistics */
+       struct hashmap_entry ent;       /* linkage into stats.postfix_hashmap */
+       char postfix[16];               /* extension, starting at its '.' */
+       unsigned int count;             /* # of regular files seen */
+       unsigned long occupied_size;    /* sum of their occupied sizes */
+       unsigned long original_size;    /* sum of their original sizes */
 };
-#define OTHERFILETYPE  ARRAY_SIZE(file_types)
+
+/* hashmap cmpfn: strncmp() of a's postfix against key, or b's if !key. */
+static int erofs_postfix_hashmap_cmp(const void *a, const void *b,
+                                 const void *key)
+{
+       const struct postfix_statistics *lhs = container_of(
+               (struct hashmap_entry *)a, struct postfix_statistics, ent);
+       const struct postfix_statistics *rhs = container_of(
+               (struct hashmap_entry *)b, struct postfix_statistics, ent);
+       const char *other = key ? key : rhs->postfix;
+
+       return strncmp(lhs->postfix, other, sizeof(lhs->postfix));
+}
+
 /* (1 << FILE_MAX_SIZE_BITS)KB */
 #define        FILE_MAX_SIZE_BITS      16
 
@@ -65,7 +84,7 @@ struct erofs_statistics {
        /* [statistics] # of files based on inode_info->flags */
        unsigned long file_category_stat[EROFS_FT_MAX];
        /* [statistics] # of files based on file name extensions */
-       unsigned int file_type_stat[OTHERFILETYPE];
+       struct hashmap postfix_hashmap;
        /* [statistics] # of files based on the original size of files */
        unsigned int file_original_size[FILE_MAX_SIZE_BITS + 1];
        /* [statistics] # of files based on the compressed size of files */
@@ -79,6 +98,7 @@ static struct option long_options[] = {
        {"device", required_argument, NULL, 3},
        {"path", required_argument, NULL, 4},
        {"ls", no_argument, NULL, 5},
+       {"ext", required_argument, NULL, 6},
        {0, 0, 0, 0},
 };
 
@@ -111,6 +131,7 @@ static void usage(void)
              " --ls            show directory contents (INODE required)\n"
              " --nid=#         show the target inode info of nid #\n"
              " --path=X        show the target inode info of path X\n"
+             " --ext=#         show the top # extension file info\n"
              " --help          display this help and exit.\n",
              stderr);
 }
@@ -164,6 +185,9 @@ static int erofsdump_parse_options_cfg(int argc, char 
**argv)
                case 5:
                        dumpcfg.show_subdirectories = true;
                        break;
+               case 6:
+                       dumpcfg.show_ext_count = atoi(optarg);
+                       break;
                default:
                        return -EINVAL;
                }
@@ -208,20 +232,41 @@ static int erofsdump_get_occupied_size(struct erofs_inode 
*inode,
        return 0;
 }
 
-static void inc_file_extension_count(const char *dname, unsigned int len)
+/*
+ * Account one regular file under its file name extension, i.e. the text
+ * from the last '.' in dname (truncated to fit postfix_statistics.postfix).
+ * Names without a '.' are not recorded at all.
+ */
+static void inc_file_extension_count(const char *dname, unsigned int len,
+               unsigned long occupied_size, unsigned long original_size)
 {
        char *postfix = memrchr(dname, '.', len);
-       int type;
+       unsigned int hash, plen;
+       struct postfix_statistics *ps;
+       char pf[sizeof(ps->postfix)] = {0};
 
-       if (!postfix) {
-               type = OTHERFILETYPE - 1;
-       } else {
-               for (type = 0; type < OTHERFILETYPE - 1; ++type)
-                       if (!strncmp(postfix, file_types[type],
-                                    len - (postfix - dname)))
-                               break;
+       if (!postfix)
+               return;
+       plen = len - (postfix - dname);
+       /* leave room for the NUL terminator that strhash() depends on */
+       if (plen > sizeof(pf) - 1)
+               plen = sizeof(pf) - 1;
+       memcpy(pf, postfix, plen);
+       hash = strhash(pf);
+       ps = hashmap_get_from_hash(&stats.postfix_hashmap, hash, pf);
+       if (!ps) {
+               ps = calloc(1, sizeof(*ps));
+               if (!ps) {
+                       erofs_err("memory allocation failed!");
+                       return;
+               }
+               memcpy(ps->postfix, pf, sizeof(pf));
+               hashmap_entry_init(&ps->ent, hash);
+               hashmap_add(&stats.postfix_hashmap, ps);
        }
-       ++stats.file_type_stat[type];
+       ps->count++;
+       ps->occupied_size += occupied_size;
+       ps->original_size += original_size;
 }
 
 static void update_file_size_statatics(erofs_off_t occupied_size,
@@ -298,7 +343,7 @@ static int erofsdump_readdir(struct erofs_dir_context *ctx)
 
        if (S_ISREG(vi.i_mode)) {
                stats.files_total_origin_size += vi.i_size;
-               inc_file_extension_count(ctx->dname, ctx->de_namelen);
+               inc_file_extension_count(ctx->dname, ctx->de_namelen, 
occupied_size, vi.i_size);
                stats.files_total_size += occupied_size;
                update_file_size_statatics(occupied_size, vi.i_size);
        }
@@ -481,27 +526,63 @@ static void erofsdump_filesize_distribution(const char *title,
        }
 }
 
-static void erofsdump_filetype_distribution(char **file_types, unsigned int len)
+/* qsort() comparator: sorts postfix_statistics by descending file count. */
+static int comp_postfix_statistics(const void *a, const void *b)
+{
+       const struct postfix_statistics *psa = a, *psb = b;
+
+       if (psa->count == psb->count)
+               return 0;
+       return psa->count < psb->count ? 1 : -1;
+}
+
+/*
+ * Print the @topk most frequent file name extensions, each with its share
+ * of all regular files and its occupied/original size percentage.
+ */
+static void erofsdump_filetype_distribution(int topk)
 {
        char col1[30];
-       unsigned int col2, i;
-       double col3;
+       unsigned int col2, i, pos = 0;
+       double col3, compression_rate;
        char col4[401];
-
+       struct postfix_statistics *ps_array, *ps;
+       struct hashmap_iter iter;
+
+       /* "+ 1" avoids a zero-sized request, so NULL always means failure */
+       ps_array = calloc(stats.postfix_hashmap.size + 1, sizeof(*ps_array));
+       if (!ps_array) {
+               erofs_err("memory allocation failed!");
+               return;
+       }
+       hashmap_iter_init(&stats.postfix_hashmap, &iter);
+       while ((ps = hashmap_iter_next(&iter)))
+               ps_array[pos++] = *ps;
+
+       DBG_BUGON(pos != stats.postfix_hashmap.size);
+       qsort(ps_array, pos, sizeof(*ps_array), comp_postfix_statistics);
        fprintf(stdout, "\nFile type distribution:\n");
        fprintf(stdout, header_format, "type", "count", "ratio",
-                       "distribution");
-       for (i = 0; i < len; i++) {
-               memset(col1, 0, sizeof(col1));
-               memset(col4, 0, sizeof(col4));
-               sprintf(col1, "%-17s", file_types[i]);
-               col2 = stats.file_type_stat[i];
+                       "compression rate");
+       for (i = 0; i < (unsigned int)topk && i < pos; i++) {
+               ps = &ps_array[i];
+               /* the precision stops the read at the end of postfix[] */
+               snprintf(col1, sizeof(col1), "%-17.*s",
+                        (int)sizeof(ps->postfix), ps->postfix);
+               col2 = ps->count;
                if (stats.file_category_stat[EROFS_FT_REG_FILE])
                        col3 = (double)(100 * col2) /
                                stats.file_category_stat[EROFS_FT_REG_FILE];
                else
                        col3 = 0.0;
-               memset(col4, '#', col3 / 2);
+               /* every file of this type is empty: print 0%, not nan/inf */
+               if (ps->original_size)
+                       compression_rate = 100.0 * ps->occupied_size /
+                                          ps->original_size;
+               else
+                       compression_rate = 0.0;
+               snprintf(col4, sizeof(col4), "%.2f%%", compression_rate);
                fprintf(stdout, chart_format, col1, col2, col3, col4);
        }
+       free(ps_array);
 }
@@ -543,6 +611,7 @@ static void erofsdump_print_statistic(void)
                .de_namelen = 0,
        };
 
+       hashmap_init(&stats.postfix_hashmap, erofs_postfix_hashmap_cmp, 0);
        err = erofsdump_readdir(&ctx);
        if (err) {
                erofs_err("read dir failed");
@@ -555,7 +624,7 @@ static void erofsdump_print_statistic(void)
        erofsdump_filesize_distribution("On-disk",
                        stats.file_comp_size,
                        ARRAY_SIZE(stats.file_comp_size));
-       erofsdump_filetype_distribution(file_types, OTHERFILETYPE);
+       erofsdump_filetype_distribution(dumpcfg.show_ext_count);
 }
 
 static void erofsdump_show_superblock(void)
@@ -624,9 +693,11 @@ int main(int argc, char **argv)
        if (dumpcfg.show_superblock)
                erofsdump_show_superblock();
 
-       if (dumpcfg.show_statistics)
+       if (dumpcfg.show_statistics) {
+               if (dumpcfg.show_ext_count == 0)
+                       dumpcfg.show_ext_count = 10;
                erofsdump_print_statistic();
-
+       }
        if (dumpcfg.show_extent && !dumpcfg.show_inode) {
                usage();
                goto exit_dev_close;
-- 
2.30.2

Reply via email to