Re: [PATCH v1 4/4] hashmap: add string interning API
Karsten Blees karsten.bl...@gmail.com writes: Interning short strings with high probability of duplicates can reduce the memory footprint and speed up comparisons. Add strintern() and memintern() APIs that use a hashmap to manage the pool of unique, interned strings. Note: strintern(getenv()) could be used to sanitize git's use of getenv(), in case we ever encounter a platform where a call to getenv() invalidates previous getenv() results (which is allowed by POSIX). I think the attribute name/value strings are already interned, so they may want to be converted to use this (or vice-versa). Signed-off-by: Karsten Blees bl...@dcon.de --- Documentation/technical/api-hashmap.txt | 15 + hashmap.c | 38 + hashmap.h | 8 +++ t/t0011-hashmap.sh | 13 +++ test-hashmap.c | 14 5 files changed, 88 insertions(+) diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt index f9215d6..00c4c29 100644 --- a/Documentation/technical/api-hashmap.txt +++ b/Documentation/technical/api-hashmap.txt @@ -193,6 +193,21 @@ more entries. `hashmap_iter_first` is a combination of both (i.e. initializes the iterator and returns the first entry, if any). +`const char *strintern(const char *string)`:: +`const void *memintern(const void *data, size_t len)`:: + + Returns the unique, interned version of the specified string or data, + similar to the `String.intern` API in Java and .NET, respectively. + Interned strings remain valid for the entire lifetime of the process. ++ +Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned +strings / data must not be modified or freed. ++ +Interned strings are best used for short strings with high probability of +duplicates. ++ +Uses a hashmap to store the pool of interned strings. + Usage example - diff --git a/hashmap.c b/hashmap.c index d1b8056..f693839 100644 --- a/hashmap.c +++ b/hashmap.c @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter) current = iter-map-table[iter-tablepos++]; } } + +struct pool_entry { + struct hashmap_entry ent; + size_t len; + unsigned char data[FLEX_ARRAY]; +}; + +static int pool_entry_cmp(const struct pool_entry *e1, + const struct pool_entry *e2, + const unsigned char *keydata) +{ + return e1-data != keydata +(e1-len != e2-len || memcmp(e1-data, keydata, e1-len)); +} + +const void *memintern(const void *data, size_t len) +{ + static struct hashmap map; + struct pool_entry key, *e; + + /* initialize string pool hashmap */ + if (!map.tablesize) + hashmap_init(map, (hashmap_cmp_fn) pool_entry_cmp, 0); + + /* lookup interned string in pool */ + hashmap_entry_init(key, memhash(data, len)); + key.len = len; + e = hashmap_get(map, key, data); + if (!e) { + /* not found: create it */ + e = xmallocz(sizeof(struct pool_entry) + len); + hashmap_entry_init(e, key.ent.hash); + e-len = len; + memcpy(e-data, data, len); + hashmap_add(map, e); + } + return e-data; +} diff --git a/hashmap.h b/hashmap.h index 12f0668..507884b 100644 --- a/hashmap.h +++ b/hashmap.h @@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map, return hashmap_iter_next(iter); } +/* string interning */ + +extern const void *memintern(const void *data, size_t len); +static inline const char *strintern(const char *string) +{ + return memintern(string, strlen(string)); +} + #endif diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh index 391e2b6..f97c805 100755 --- a/t/t0011-hashmap.sh +++ b/t/t0011-hashmap.sh @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' ' ' +test_expect_success 'string interning' ' + +test_hashmap intern value1 +intern Value1 +intern value2 +intern value2 + value1 +Value1 +value2 +value2 + +' + test_done diff --git a/test-hashmap.c b/test-hashmap.c index 3c9f67b..07aa7ec 100644 --- a/test-hashmap.c +++ b/test-hashmap.c @@ -234,6 +234,20 @@ int main(int argc, char *argv[]) /* print table sizes */ printf(%u %u\n, map.tablesize, map.size); + } else if (!strcmp(intern, cmd) l1) { + + /* test that strintern works */ + const char *i1 = strintern(p1); + const char *i2 = strintern(p1); + if (strcmp(i1, p1)) + printf(strintern(%s) returns %s\n, p1, i1); + else if (i1 == p1) +
Re: [PATCH v1 4/4] hashmap: add string interning API
Karsten Blees karsten.bl...@gmail.com writes: --- a/t/t0011-hashmap.sh +++ b/t/t0011-hashmap.sh @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' ' ' +test_expect_success 'string interning' ' + +test_hashmap intern value1 +intern Value1 +intern value2 +intern value2 + value1 +Value1 +value2 +value2 + +' Indentation is broken here, but it's consistant with the current style of the file. You may want a modernize style patch before yours and then indent properly, but that shouldn't block inclusion. -- Matthieu Moy http://www-verimag.imag.fr/~moy/ -- To unsubscribe from this list: send the line unsubscribe git in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v1 4/4] hashmap: add string interning API
Interning short strings with high probability of duplicates can reduce the memory footprint and speed up comparisons. Add strintern() and memintern() APIs that use a hashmap to manage the pool of unique, interned strings. Note: strintern(getenv()) could be used to sanitize git's use of getenv(), in case we ever encounter a platform where a call to getenv() invalidates previous getenv() results (which is allowed by POSIX). Signed-off-by: Karsten Blees bl...@dcon.de --- Documentation/technical/api-hashmap.txt | 15 + hashmap.c | 38 + hashmap.h | 8 +++ t/t0011-hashmap.sh | 13 +++ test-hashmap.c | 14 5 files changed, 88 insertions(+) diff --git a/Documentation/technical/api-hashmap.txt b/Documentation/technical/api-hashmap.txt index f9215d6..00c4c29 100644 --- a/Documentation/technical/api-hashmap.txt +++ b/Documentation/technical/api-hashmap.txt @@ -193,6 +193,21 @@ more entries. `hashmap_iter_first` is a combination of both (i.e. initializes the iterator and returns the first entry, if any). +`const char *strintern(const char *string)`:: +`const void *memintern(const void *data, size_t len)`:: + + Returns the unique, interned version of the specified string or data, + similar to the `String.intern` API in Java and .NET, respectively. + Interned strings remain valid for the entire lifetime of the process. ++ +Can be used as `[x]strdup()` or `xmemdupz` replacement, except that interned +strings / data must not be modified or freed. ++ +Interned strings are best used for short strings with high probability of +duplicates. ++ +Uses a hashmap to store the pool of interned strings. + Usage example - diff --git a/hashmap.c b/hashmap.c index d1b8056..f693839 100644 --- a/hashmap.c +++ b/hashmap.c @@ -226,3 +226,41 @@ void *hashmap_iter_next(struct hashmap_iter *iter) current = iter-map-table[iter-tablepos++]; } } + +struct pool_entry { + struct hashmap_entry ent; + size_t len; + unsigned char data[FLEX_ARRAY]; +}; + +static int pool_entry_cmp(const struct pool_entry *e1, + const struct pool_entry *e2, + const unsigned char *keydata) +{ + return e1-data != keydata + (e1-len != e2-len || memcmp(e1-data, keydata, e1-len)); +} + +const void *memintern(const void *data, size_t len) +{ + static struct hashmap map; + struct pool_entry key, *e; + + /* initialize string pool hashmap */ + if (!map.tablesize) + hashmap_init(map, (hashmap_cmp_fn) pool_entry_cmp, 0); + + /* lookup interned string in pool */ + hashmap_entry_init(key, memhash(data, len)); + key.len = len; + e = hashmap_get(map, key, data); + if (!e) { + /* not found: create it */ + e = xmallocz(sizeof(struct pool_entry) + len); + hashmap_entry_init(e, key.ent.hash); + e-len = len; + memcpy(e-data, data, len); + hashmap_add(map, e); + } + return e-data; +} diff --git a/hashmap.h b/hashmap.h index 12f0668..507884b 100644 --- a/hashmap.h +++ b/hashmap.h @@ -87,4 +87,12 @@ static inline void *hashmap_iter_first(struct hashmap *map, return hashmap_iter_next(iter); } +/* string interning */ + +extern const void *memintern(const void *data, size_t len); +static inline const char *strintern(const char *string) +{ + return memintern(string, strlen(string)); +} + #endif diff --git a/t/t0011-hashmap.sh b/t/t0011-hashmap.sh index 391e2b6..f97c805 100755 --- a/t/t0011-hashmap.sh +++ b/t/t0011-hashmap.sh @@ -237,4 +237,17 @@ test_expect_success 'grow / shrink' ' ' +test_expect_success 'string interning' ' + +test_hashmap intern value1 +intern Value1 +intern value2 +intern value2 + value1 +Value1 +value2 +value2 + +' + test_done diff --git a/test-hashmap.c b/test-hashmap.c index 3c9f67b..07aa7ec 100644 --- a/test-hashmap.c +++ b/test-hashmap.c @@ -234,6 +234,20 @@ int main(int argc, char *argv[]) /* print table sizes */ printf(%u %u\n, map.tablesize, map.size); + } else if (!strcmp(intern, cmd) l1) { + + /* test that strintern works */ + const char *i1 = strintern(p1); + const char *i2 = strintern(p1); + if (strcmp(i1, p1)) + printf(strintern(%s) returns %s\n, p1, i1); + else if (i1 == p1) + printf(strintern(%s) returns input pointer\n, p1); + else if (i1 != i2) + printf(strintern(%s) != strintern(%s), i1, i2); + else +