On 8/28/2017 4:28 PM, Kevin Willford wrote:
The code was using two string_lists, one for the directories and
one for the files.  The code never checks the lists independently
so we should be able to only use one list.  The string_list also
is a O(log n) for lookup and insertion.  Switching this to use a
hashmap will give O(1) which will save some time when there are
millions of paths that will be checked.


Good observation that the two string lists were never used independently. Perhaps the intent was to help the O(log n) issue by splitting them up? A single hashmap is much faster. Looks good.


Signed-off-by: Kevin Willford <kewi...@microsoft.com>
---
  merge-recursive.c | 48 +++++++++++++++++++++++++++++++++++++-----------
  merge-recursive.h |  3 +--
  2 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/merge-recursive.c b/merge-recursive.c
index d47f40ea81..ef4fe5f09f 100644
--- a/merge-recursive.c
+++ b/merge-recursive.c
@@ -24,6 +24,26 @@
  #include "dir.h"
  #include "submodule.h"
+struct path_hashmap_entry {
+       struct hashmap_entry;
+       char path[FLEX_ARRAY];
+};
+
+static unsigned int (*pathhash)(const char *path);
+static int (*pathcmp)(const char *a, const char *b);
+
+static int path_hashmap_cmp(const void *cmp_data,
+                           const void *entry,
+                           const void *entry_or_key,
+                           const void *keydata)
+{
+       const struct path_hashmap_entry *a = entry;
+       const struct path_hashmap_entry *b = entry_or_key;
+       const char *key = keydata;
+
+       return pathcmp(a->path, key ? key : b->path);
+}
+
  static void flush_output(struct merge_options *o)
  {
        if (o->buffer_output < 2 && o->obuf.len) {
@@ -314,15 +334,15 @@ static int save_files_dirs(const unsigned char *sha1,
                struct strbuf *base, const char *path,
                unsigned int mode, int stage, void *context)
  {
+       struct path_hashmap_entry *entry;
        int baselen = base->len;
        struct merge_options *o = context;
strbuf_addstr(base, path); - if (S_ISDIR(mode))
-               string_list_insert(&o->current_directory_set, base->buf);
-       else
-               string_list_insert(&o->current_file_set, base->buf);
+       FLEX_ALLOC_MEM(entry, path, base->buf, base->len);
+       hashmap_entry_init(entry, pathhash(entry->path));
+       hashmap_add(&o->current_file_dir_set, entry);
strbuf_setlen(base, baselen);
        return (S_ISDIR(mode) ? READ_TREE_RECURSIVE : 0);
@@ -642,6 +662,8 @@ static void add_flattened_path(struct strbuf *out, const 
char *s)
static char *unique_path(struct merge_options *o, const char *path, const char *branch)
  {
+       struct path_hashmap_entry dummy;
+       struct path_hashmap_entry *entry;
        struct strbuf newpath = STRBUF_INIT;
        int suffix = 0;
        size_t base_len;
@@ -650,14 +672,17 @@ static char *unique_path(struct merge_options *o, const 
char *path, const char *
        add_flattened_path(&newpath, branch);
base_len = newpath.len;
-       while (string_list_has_string(&o->current_file_set, newpath.buf) ||
-              string_list_has_string(&o->current_directory_set, newpath.buf) ||
+       hashmap_entry_init(&dummy, pathhash(newpath.buf));
+       while (hashmap_get(&o->current_file_dir_set, &dummy, newpath.buf) ||
               (!o->call_depth && file_exists(newpath.buf))) {
                strbuf_setlen(&newpath, base_len);
                strbuf_addf(&newpath, "_%d", suffix++);
+               hashmap_entry_init(&dummy, pathhash(newpath.buf));
        }
- string_list_insert(&o->current_file_set, newpath.buf);
+       FLEX_ALLOC_MEM(entry, path, newpath.buf, newpath.len);
+       hashmap_entry_init(entry, pathhash(entry->path));
+       hashmap_add(&o->current_file_dir_set, entry);
        return strbuf_detach(&newpath, NULL);
  }
@@ -1941,8 +1966,7 @@ int merge_trees(struct merge_options *o,
        if (unmerged_cache()) {
                struct string_list *entries, *re_head, *re_merge;
                int i;
-               string_list_clear(&o->current_file_set, 1);
-               string_list_clear(&o->current_directory_set, 1);
+               hashmap_init(&o->current_file_dir_set, path_hashmap_cmp, NULL, 
512);
                get_files_dirs(o, head);
                get_files_dirs(o, merge);
@@ -1978,6 +2002,8 @@ int merge_trees(struct merge_options *o,
                string_list_clear(re_head, 0);
                string_list_clear(entries, 1);
+ hashmap_free(&o->current_file_dir_set, 1);
+
                free(re_merge);
                free(re_head);
                free(entries);
@@ -2179,8 +2205,8 @@ void init_merge_options(struct merge_options *o)
        if (o->verbosity >= 5)
                o->buffer_output = 0;
        strbuf_init(&o->obuf, 0);
-       string_list_init(&o->current_file_set, 1);
-       string_list_init(&o->current_directory_set, 1);
+       pathhash = ignore_case ? strihash : strhash;
+       pathcmp = ignore_case ? strcasecmp : strcmp;
        string_list_init(&o->df_conflict_file_set, 1);
  }
diff --git a/merge-recursive.h b/merge-recursive.h
index 735343b413..80d69d1401 100644
--- a/merge-recursive.h
+++ b/merge-recursive.h
@@ -25,8 +25,7 @@ struct merge_options {
        int show_rename_progress;
        int call_depth;
        struct strbuf obuf;
-       struct string_list current_file_set;
-       struct string_list current_directory_set;
+       struct hashmap current_file_dir_set;
        struct string_list df_conflict_file_set;
  };

Reply via email to