Changeset: 4d68a687d4aa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4d68a687d4aa
Modified Files:
        gdk/gdk_string.c
Branch: string-dedup
Log Message:

Rename some functions.


diffs (207 lines):

diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -14,30 +14,13 @@
 /* String Atom Implementation
  *
  * Strings are stored in two parts.  The first part is the normal tail
- * heap which contains a list of offsets.  The second part is the
- * theap which contains the actual strings.  The offsets in the tail
- * heap (a.k.a. offset heap) point into the theap (a.k.a. string
- * heap).  Strings are NULL-terminated and are stored without any
- * escape sequences.  Strings are encoded using the UTF-8 encoding
- * of Unicode.  This means that individual "characters" (really,
- * Unicode code points) can be between one and four bytes long.
- *
- * Because in many typical situations there are lots of duplicated
- * string values that are being stored in a table, but also in many
- * (other) typical situations there are very few duplicated string
- * values stored, a scheme has been introduced to cater to both
- * situations.
- *
- * When the string heap is "small" (defined as less than 64KiB), the
- * string heap is fully duplicate eliminated.  When the string heap
- * grows beyond this size, the heap is not kept free of duplicate
- * strings, but there is then a heuristic that tries to limit the
- * number of duplicates.
- *
- * This is done by having a fixed sized hash table at the start of the
- * string heap, and allocating space for collision lists in the first
- * 64KiB of the string heap.  After the first 64KiB no extra space is
- * allocated for lists, so hash collisions cannot be resolved.
+ * heap (theap) which contains a list of offsets.  The second part is
+ * the tvheap which contains the actual strings.  The offsets in the
+ * tail heap (a.k.a. offset heap) point into the tvheap (a.k.a. string
+ * heap).  Strings are NULL-terminated and are stored without any escape
+ * sequences.  Strings are encoded using the UTF-8 encoding of Unicode.
+ * This means that individual "characters" (really, Unicode code points)
+ * can be between one and four bytes long.
  */
 
 /* some of these macros are duplicates from gdk_atoms.c */
@@ -74,8 +57,10 @@ struct hash {
        Heap heap;
 };
 
+#define STRHASH_VERSION        1
+#define STRHASH_SYNCED (UINT64_C(1) << 24)
 #define EMPTY ((uint64_t) -1)
-#define OFFSET 0
+#define OFFSET 1
 #define PSLSHIFT 40
 #define PSLMASK ((UINT64_C(1) << PSLSHIFT) - 1)
 
@@ -90,7 +75,7 @@ hash_str(struct hash *h, const char *val
 }
 
 static inline var_t
-enter(BAT *b, const char *value)
+str_enter(BAT *b, const char *value)
 {
        size_t len = strlen(value) + 1;
        var_t pos = b->tvheap->free;
@@ -109,7 +94,7 @@ enter(BAT *b, const char *value)
 }
 
 static inline void
-reinsert(struct hash *h, BUN hsh, var_t pos)
+str_reinsert(struct hash *h, BUN hsh, var_t pos)
 {
        uint64_t psl = 0;
        uint64_t swp = EMPTY;
@@ -140,7 +125,7 @@ reinsert(struct hash *h, BUN hsh, var_t 
 }
 
 static inline void
-rmstring(struct hash *h, BUN pos)
+str_remove(struct hash *h, BUN pos)
 {
        h->buckets[pos] = EMPTY;
        for (;;) {
@@ -160,7 +145,7 @@ rmstring(struct hash *h, BUN pos)
 }
 
 static inline void
-incpsl(struct hash *h)
+str_incPSL(struct hash *h)
 {
        uint64_t hsh = 0;
        for (;;) {
@@ -177,7 +162,7 @@ incpsl(struct hash *h)
 }
 
 static inline gdk_return
-grow1(BAT *b, struct hash *h)
+str_growhash(BAT *b, struct hash *h)
 {
        BUN oldmask1 = h->mask1;
        BUN oldnbucket = h->nbucket;
@@ -196,8 +181,8 @@ grow1(BAT *b, struct hash *h)
        h->growlim = (BUN) (h->nbucket * (0.75 - (h->nbucket & h->mask1) / (2.0 
* h->mask2)));
        assert(h->mask1 < h->nbucket && h->nbucket <= h->mask2);
 
-       incpsl(h);
-       rmstring(h, h->nbucket - 1);
+       str_incPSL(h);
+       str_remove(h, h->nbucket - 1);
 
        BUN hsh = oldnbucket & oldmask1;
        uint64_t psl = 0;
@@ -215,8 +200,8 @@ grow1(BAT *b, struct hash *h)
                        BUN c = strHash(v);
                        if ((c & h->mask2) == oldnbucket) {
                                /* this one should be moved */
-                               rmstring(h, hsh);
-                               reinsert(h, oldnbucket, (var_t) bkt);
+                               str_remove(h, hsh);
+                               str_reinsert(h, oldnbucket, (var_t) bkt);
                                continue;
                        }
                }
@@ -228,7 +213,7 @@ grow1(BAT *b, struct hash *h)
 }
 
 static var_t
-insert(BAT *b, struct hash *h, const char *value)
+str_insert(BAT *b, struct hash *h, const char *value)
 {
        BUN hsh = hash_str(h, value);
        uint64_t psl = 0;       /* probe sequence length */
@@ -242,7 +227,7 @@ insert(BAT *b, struct hash *h, const cha
                        if (swp == EMPTY) {
                                assert(new == (var_t) -1);
                                /* didn't enter value yet, so do it now */
-                               new = enter(b, value);
+                               new = str_enter(b, value);
                                if (new == (var_t) -1)
                                        return (var_t) -1;
                                swp = (uint64_t) new;
@@ -251,7 +236,7 @@ insert(BAT *b, struct hash *h, const cha
                        assert(new != (var_t) -1);
                        h->buckets[hsh] = swp | psl << PSLSHIFT;
                        while (h->nentries >= h->growlim)
-                               if (grow1(b, h) != GDK_SUCCEED)
+                               if (str_growhash(b, h) != GDK_SUCCEED)
                                        return (var_t) -1;
                        return new;
                }
@@ -269,7 +254,7 @@ insert(BAT *b, struct hash *h, const cha
                if (psl > psl2) {
                        if (swp == EMPTY) {
                                assert(new == (var_t) -1);
-                               new = enter(b, value);
+                               new = str_enter(b, value);
                                if (new == (var_t) -1)
                                        return (var_t) -1;
                                swp = (uint64_t) new;
@@ -286,7 +271,7 @@ insert(BAT *b, struct hash *h, const cha
 }
 
 static struct hash *
-init(BAT *b)
+str_hashinit(BAT *b)
 {
        struct hash *hs = GDKmalloc(sizeof(struct hash));
        if (hs == NULL)
@@ -320,6 +305,7 @@ init(BAT *b)
                return NULL;
        }
        hs->heap.free = (hs->nbucket + OFFSET) * sizeof(uint64_t);
+       *(uint64_t *) hs->heap.base = STRHASH_VERSION;
        hs->buckets = (uint64_t *) hs->heap.base + OFFSET;
        for (BUN i = 0; i < minimum; i++)
                hs->buckets[i] = EMPTY;
@@ -328,10 +314,10 @@ init(BAT *b)
                const char *v = hp->base + pos;
                size_t len = strlen(v) + 1;
                BUN hsh = hash_str(hs, v);
-               reinsert(hs, hsh, pos);
+               str_reinsert(hs, hsh, pos);
                hs->nentries++;
                while (hs->nentries >= hs->growlim) {
-                       if (grow1(b, hs) != GDK_SUCCEED) {
+                       if (str_growhash(b, hs) != GDK_SUCCEED) {
                                HEAPfree(hp, true);
                                GDKfree(hs);
                                return NULL;
@@ -359,9 +345,9 @@ strCleanHash(Heap *h, bool rebuild)
 var_t
 strPut(BAT *b, var_t *dst, const void *v)
 {
-       if (b->strhash == NULL && (b->strhash = init(b)) == NULL)
+       if (b->strhash == NULL && (b->strhash = str_hashinit(b)) == NULL)
                return (var_t) -1;
-       var_t pos = insert(b, b->strhash, v);
+       var_t pos = str_insert(b, b->strhash, v);
        if (pos == (var_t) -1)
                return (var_t) -1;
        *dst = pos;
@@ -374,7 +360,7 @@ strLocate(BAT *b, const char *value)
        if (b->tvheap->parentid != b->batCacheid)
                return strLocate(BBP_cache(b->tvheap->parentid), value);
 
-       if (b->strhash == NULL && (b->strhash = init(b)) == NULL)
+       if (b->strhash == NULL && (b->strhash = str_hashinit(b)) == NULL)
                return (var_t) -1;
 
        struct hash *h = b->strhash;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to