Let's read a pack to feed our dictionary with all the path strings
contained in all the tree objects.

Dump the resulting dictionary sorted by frequency to stdout.

Signed-off-by: Nicolas Pitre <n...@fluxnic.net>
---
 Makefile        |   1 +
 packv4-create.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)

diff --git a/Makefile b/Makefile
index 3588ca1..4716113 100644
--- a/Makefile
+++ b/Makefile
@@ -550,6 +550,7 @@ PROGRAM_OBJS += shell.o
 PROGRAM_OBJS += show-index.o
 PROGRAM_OBJS += upload-pack.o
 PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += packv4-create.o
 
 # Binary suffix, set to .exe for Windows builds
 X =
diff --git a/packv4-create.c b/packv4-create.c
index 2de6d41..00762a5 100644
--- a/packv4-create.c
+++ b/packv4-create.c
@@ -9,6 +9,8 @@
  */
 
 #include "cache.h"
+#include "object.h"
+#include "tree-walk.h"
 
 struct data_entry {
        unsigned offset;
@@ -125,6 +127,22 @@ static void sort_dict_entries_by_hits(struct dict_table *t)
        rehash_entries(t);
 }
 
+static struct dict_table *tree_path_table;
+
+static int add_tree_dict_entries(void *buf, unsigned long size)
+{
+       struct tree_desc desc;
+       struct name_entry name_entry;
+
+       if (!tree_path_table)
+               tree_path_table = create_dict_table();
+
+       init_tree_desc(&desc, buf, size);
+       while (tree_entry(&desc, &name_entry))
+               dict_add_entry(tree_path_table, name_entry.path);
+       return 0;
+}
+
 void dict_dump(struct dict_table *t)
 {
        int i;
@@ -135,3 +153,122 @@ void dict_dump(struct dict_table *t)
                        t->entry[i].hits,
                        t->data + t->entry[i].offset);
 }
+
+struct idx_entry
+{
+       off_t                offset;
+       const unsigned char *sha1;
+};
+
+static int sort_by_offset(const void *e1, const void *e2)
+{
+       const struct idx_entry *entry1 = e1;
+       const struct idx_entry *entry2 = e2;
+       if (entry1->offset < entry2->offset)
+               return -1;
+       if (entry1->offset > entry2->offset)
+               return 1;
+       return 0;
+}
+static int create_pack_dictionaries(struct packed_git *p)
+{
+       uint32_t nr_objects, i;
+       struct idx_entry *objects;
+
+       nr_objects = p->num_objects;
+       objects = xmalloc((nr_objects + 1) * sizeof(*objects));
+       objects[nr_objects].offset = p->index_size - 40;
+       for (i = 0; i < nr_objects; i++) {
+               objects[i].sha1 = nth_packed_object_sha1(p, i);
+               objects[i].offset = nth_packed_object_offset(p, i);
+       }
+       qsort(objects, nr_objects, sizeof(*objects), sort_by_offset);
+
+       for (i = 0; i < nr_objects; i++) {
+               void *data;
+               enum object_type type;
+               unsigned long size;
+               struct object_info oi = {};
+
+               oi.typep = &type;
+               oi.sizep = &size;
+               if (packed_object_info(p, objects[i].offset, &oi) < 0)
+                       die("cannot get type of %s from %s",
+                           sha1_to_hex(objects[i].sha1), p->pack_name);
+
+               switch (type) {
+               case OBJ_TREE:
+                       break;
+               default:
+                       continue;
+               }
+               data = unpack_entry(p, objects[i].offset, &type, &size);
+               if (!data)
+                       die("cannot unpack %s from %s",
+                           sha1_to_hex(objects[i].sha1), p->pack_name);
+               if (check_sha1_signature(objects[i].sha1, data, size, 
typename(type)))
+                       die("packed %s from %s is corrupt",
+                           sha1_to_hex(objects[i].sha1), p->pack_name);
+               if (add_tree_dict_entries(data, size) < 0)
+                       die("can't process %s object %s",
+                               typename(type), sha1_to_hex(objects[i].sha1));
+               free(data);
+       }
+       free(objects);
+
+       return 0;
+}
+
+static int process_one_pack(const char *path)
+{
+       char arg[PATH_MAX];
+       int len;
+       struct packed_git *p;
+
+       len = strlcpy(arg, path, PATH_MAX);
+       if (len >= PATH_MAX)
+               return error("name too long: %s", path);
+
+       /*
+        * In addition to "foo.idx" we accept "foo.pack" and "foo";
+        * normalize these forms to "foo.idx" for add_packed_git().
+        */
+       if (has_extension(arg, ".pack")) {
+               strcpy(arg + len - 5, ".idx");
+               len--;
+       } else if (!has_extension(arg, ".idx")) {
+               if (len + 4 >= PATH_MAX)
+                       return error("name too long: %s.idx", arg);
+               strcpy(arg + len, ".idx");
+               len += 4;
+       }
+
+       /*
+        * add_packed_git() uses our buffer (containing "foo.idx") to
+        * build the pack filename ("foo.pack").  Make sure it fits.
+        */
+       if (len + 1 >= PATH_MAX) {
+               arg[len - 4] = '\0';
+               return error("name too long: %s.pack", arg);
+       }
+
+       p = add_packed_git(arg, len, 1);
+       if (!p)
+               return error("packfile %s not found.", arg);
+
+       install_packed_git(p);
+       if (open_pack_index(p))
+               return error("packfile %s index not opened", p->pack_name);
+       return create_pack_dictionaries(p);
+}
+
+int main(int argc, char *argv[])
+{
+       if (argc != 2) {
+               fprintf(stderr, "Usage: %s <packfile>\n", argv[0]);
+               exit(1);
+       }
+       process_one_pack(argv[1]);
+       dict_dump(tree_path_table);
+       return 0;
+}
-- 
1.8.4.38.g317e65b

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to