Re: [PATCH v1 4/4] hashmap: add string interning API

2014-07-07 Thread Junio C Hamano
Karsten Blees karsten.bl...@gmail.com writes:

 Interning short strings with high probability of duplicates can reduce the
 memory footprint and speed up comparisons.

 Add strintern() and memintern() APIs that use a hashmap to manage the pool
 of unique, interned strings.

 Note: strintern(getenv()) could be used to sanitize git's use of getenv(),
 in case we ever encounter a platform where a call to getenv() invalidates
 previous getenv() results (which is allowed by POSIX).

I think the attribute name/value strings are already interned, so
they may want to be converted to use this (or vice-versa).


 Signed-off-by: Karsten Blees bl...@dcon.de
 ---
  Documentation/technical/api-hashmap.txt | 15 +
  hashmap.c   | 38 
 +
  hashmap.h   |  8 +++
  t/t0011-hashmap.sh  | 13 +++
  test-hashmap.c  | 14 
  5 files changed, 88 insertions(+)

 diff --git a/Documentation/technical/api-hashmap.txt 
 b/Documentation/technical/api-hashmap.txt
 index f9215d6..00c4c29 100644
 --- a/Documentation/technical/api-hashmap.txt
 +++ b/Documentation/technical/api-hashmap.txt
 @@ -193,6 +193,21 @@ more entries.
  `hashmap_iter_first` is a combination of both (i.e. initializes the iterator
  and returns the first entry, if any).
  
 +`const char *strintern(const char *string)`::
 +`const void *memintern(const void *data, size_t len)`::
 +
 + Returns the unique, interned version of the specified string or data,
 + similar to the `String.intern` API in Java and .NET, respectively.
 + Interned strings remain valid for the entire lifetime of the process.
 ++
 +Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
 +strings / data must not be modified or freed.
 ++
 +Interned strings are best used for short strings with high probability of
 +duplicates.
 ++
 +Uses a hashmap to store the pool of interned strings.
 +
  Usage example
  -
  
 diff --git a/hashmap.c b/hashmap.c
 index d1b8056..f693839 100644
 --- a/hashmap.c
 +++ b/hashmap.c
 @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
   current = iter-map-table[iter-tablepos++];
   }
  }
 +
 +struct pool_entry {
 + struct hashmap_entry ent;
 + size_t len;
 + unsigned char data[FLEX_ARRAY];
 +};
 +
 +static int pool_entry_cmp(const struct pool_entry *e1,
 +   const struct pool_entry *e2,
 +   const unsigned char *keydata)
 +{
 + return e1-data != keydata 
 +(e1-len != e2-len || memcmp(e1-data, keydata, e1-len));
 +}
 +
 +const void *memintern(const void *data, size_t len)
 +{
 + static struct hashmap map;
 + struct pool_entry key, *e;
 +
 + /* initialize string pool hashmap */
 + if (!map.tablesize)
 + hashmap_init(map, (hashmap_cmp_fn) pool_entry_cmp, 0);
 +
 + /* lookup interned string in pool */
 + hashmap_entry_init(key, memhash(data, len));
 + key.len = len;
 + e = hashmap_get(map, key, data);
 + if (!e) {
 + /* not found: create it */
 + e = xmallocz(sizeof(struct pool_entry) + len);
 + hashmap_entry_init(e, key.ent.hash);
 + e-len = len;
 + memcpy(e-data, data, len);
 + hashmap_add(map, e);
 + }
 + return e-data;
 +}
 diff --git a/hashmap.h b/hashmap.h
 index 12f0668..507884b 100644
 --- a/hashmap.h
 +++ b/hashmap.h
 @@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
   return hashmap_iter_next(iter);
  }
  
 +/* string interning */
 +
 +extern const void *memintern(const void *data, size_t len);
 +static inline const char *strintern(const char *string)
 +{
 + return memintern(string, strlen(string));
 +}
 +
  #endif
 diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh
 index 391e2b6..f97c805 100755
 --- a/t/t0011-hashmap.sh
 +++ b/t/t0011-hashmap.sh
 @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
  
  '
  
 +test_expect_success 'string interning' '
 +
 +test_hashmap intern value1
 +intern Value1
 +intern value2
 +intern value2
 + value1
 +Value1
 +value2
 +value2
 +
 +'
 +
  test_done
 diff --git a/test-hashmap.c b/test-hashmap.c
 index 3c9f67b..07aa7ec 100644
 --- a/test-hashmap.c
 +++ b/test-hashmap.c
 @@ -234,6 +234,20 @@ int main(int argc, char *argv[])
   /* print table sizes */
   printf(%u %u\n, map.tablesize, map.size);
  
 + } else if (!strcmp(intern, cmd)  l1) {
 +
 + /* test that strintern works */
 + const char *i1 = strintern(p1);
 + const char *i2 = strintern(p1);
 + if (strcmp(i1, p1))
 + printf(strintern(%s) returns %s\n, p1, i1);
 + else if (i1 == p1)
 + 

Re: [PATCH v1 4/4] hashmap: add string interning API

2014-07-03 Thread Matthieu Moy
Karsten Blees karsten.bl...@gmail.com writes:

 --- a/t/t0011-hashmap.sh
 +++ b/t/t0011-hashmap.sh
 @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
  
  '
  
 +test_expect_success 'string interning' '
 +
 +test_hashmap intern value1
 +intern Value1
 +intern value2
 +intern value2
 + value1
 +Value1
 +value2
 +value2
 +
 +'

Indentation is broken here, but it's consistant with the current style
of the file. You may want a modernize style patch before yours and
then indent properly, but that shouldn't block inclusion.

-- 
Matthieu Moy
http://www-verimag.imag.fr/~moy/
--
To unsubscribe from this list: send the line unsubscribe git in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v1 4/4] hashmap: add string interning API

2014-07-02 Thread Karsten Blees
Interning short strings with high probability of duplicates can reduce the
memory footprint and speed up comparisons.

Add strintern() and memintern() APIs that use a hashmap to manage the pool
of unique, interned strings.

Note: strintern(getenv()) could be used to sanitize git's use of getenv(),
in case we ever encounter a platform where a call to getenv() invalidates
previous getenv() results (which is allowed by POSIX).

Signed-off-by: Karsten Blees bl...@dcon.de
---
 Documentation/technical/api-hashmap.txt | 15 +
 hashmap.c   | 38 +
 hashmap.h   |  8 +++
 t/t0011-hashmap.sh  | 13 +++
 test-hashmap.c  | 14 
 5 files changed, 88 insertions(+)

diff --git a/Documentation/technical/api-hashmap.txt 
b/Documentation/technical/api-hashmap.txt
index f9215d6..00c4c29 100644
--- a/Documentation/technical/api-hashmap.txt
+++ b/Documentation/technical/api-hashmap.txt
@@ -193,6 +193,21 @@ more entries.
 `hashmap_iter_first` is a combination of both (i.e. initializes the iterator
 and returns the first entry, if any).
 
+`const char *strintern(const char *string)`::
+`const void *memintern(const void *data, size_t len)`::
+
+   Returns the unique, interned version of the specified string or data,
+   similar to the `String.intern` API in Java and .NET, respectively.
+   Interned strings remain valid for the entire lifetime of the process.
++
+Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned
+strings / data must not be modified or freed.
++
+Interned strings are best used for short strings with high probability of
+duplicates.
++
+Uses a hashmap to store the pool of interned strings.
+
 Usage example
 -
 
diff --git a/hashmap.c b/hashmap.c
index d1b8056..f693839 100644
--- a/hashmap.c
+++ b/hashmap.c
@@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter)
current = iter-map-table[iter-tablepos++];
}
 }
+
+struct pool_entry {
+   struct hashmap_entry ent;
+   size_t len;
+   unsigned char data[FLEX_ARRAY];
+};
+
+static int pool_entry_cmp(const struct pool_entry *e1,
+ const struct pool_entry *e2,
+ const unsigned char *keydata)
+{
+   return e1-data != keydata 
+  (e1-len != e2-len || memcmp(e1-data, keydata, e1-len));
+}
+
+const void *memintern(const void *data, size_t len)
+{
+   static struct hashmap map;
+   struct pool_entry key, *e;
+
+   /* initialize string pool hashmap */
+   if (!map.tablesize)
+   hashmap_init(map, (hashmap_cmp_fn) pool_entry_cmp, 0);
+
+   /* lookup interned string in pool */
+   hashmap_entry_init(key, memhash(data, len));
+   key.len = len;
+   e = hashmap_get(map, key, data);
+   if (!e) {
+   /* not found: create it */
+   e = xmallocz(sizeof(struct pool_entry) + len);
+   hashmap_entry_init(e, key.ent.hash);
+   e-len = len;
+   memcpy(e-data, data, len);
+   hashmap_add(map, e);
+   }
+   return e-data;
+}
diff --git a/hashmap.h b/hashmap.h
index 12f0668..507884b 100644
--- a/hashmap.h
+++ b/hashmap.h
@@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map,
return hashmap_iter_next(iter);
 }
 
+/* string interning */
+
+extern const void *memintern(const void *data, size_t len);
+static inline const char *strintern(const char *string)
+{
+   return memintern(string, strlen(string));
+}
+
 #endif
diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh
index 391e2b6..f97c805 100755
--- a/t/t0011-hashmap.sh
+++ b/t/t0011-hashmap.sh
@@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' '
 
 '
 
+test_expect_success 'string interning' '
+
+test_hashmap intern value1
+intern Value1
+intern value2
+intern value2
+ value1
+Value1
+value2
+value2
+
+'
+
 test_done
diff --git a/test-hashmap.c b/test-hashmap.c
index 3c9f67b..07aa7ec 100644
--- a/test-hashmap.c
+++ b/test-hashmap.c
@@ -234,6 +234,20 @@ int main(int argc, char *argv[])
/* print table sizes */
printf(%u %u\n, map.tablesize, map.size);
 
+   } else if (!strcmp(intern, cmd)  l1) {
+
+   /* test that strintern works */
+   const char *i1 = strintern(p1);
+   const char *i2 = strintern(p1);
+   if (strcmp(i1, p1))
+   printf(strintern(%s) returns %s\n, p1, i1);
+   else if (i1 == p1)
+   printf(strintern(%s) returns input pointer\n, 
p1);
+   else if (i1 != i2)
+   printf(strintern(%s) != strintern(%s), i1, 
i2);
+   else
+