Changeset: d593e5eda67f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d593e5eda67f
Modified Files:
        gdk/gdk_batop.c
        gdk/gdk_bbp.c
        gdk/gdk_private.h
        gdk/gdk_storage.c
        gdk/gdk_string.c
        testing/Mtest.py.in
Branch: string-dedup
Log Message:

Store probe sequence length in string hash buckets.


diffs (truncated from 480 to 300 lines):

diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -2321,7 +2321,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
                bn = BATproject(o, b);
                if (bn == NULL)
                        goto error;
-               if (bn->ttype == TYPE_void || isVIEW(bn)) {
+               if (bn->ttype == TYPE_void || bn->theap->parentid != 
bn->batCacheid) {
                        BAT *b2 = COLcopy(bn, ATOMtype(bn->ttype), true, 
TRANSIENT);
                        BBPunfix(bn->batCacheid);
                        bn = b2;
@@ -2423,7 +2423,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
                assert(g->ttype == TYPE_oid);
                grps = (oid *) Tloc(g, 0);
                prev = grps[0];
-               if (BATmaterialize(bn) != GDK_SUCCEED)
+               if (bn->ttype == TYPE_void && BATmaterialize(bn) != GDK_SUCCEED)
                        goto error;
                for (r = 0, p = 1, q = BATcount(g); p < q; p++) {
                        if (grps[p] != prev) {
diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c
--- a/gdk/gdk_bbp.c
+++ b/gdk/gdk_bbp.c
@@ -339,6 +339,8 @@ BBPselectfarm(role_t role, int type, enu
        if (hptype == orderidxheap)
                role = TRANSIENT;
 #endif
+       if (hptype == strhashheap)
+               role = TRANSIENT; /* for now */
        for (i = 0; i < MAXFARMS; i++)
                if (BBPfarms[i].roles & (1U << (int) role))
                        return i;
@@ -4035,6 +4037,9 @@ BBPdiskscan(const char *parent, size_t b
 #else
                                delete = true;
 #endif
+                       } else if (strncmp(p + 1, "tstrhash", 8) == 0) {
+                               /* string hash, we rebuild */
+                               delete = true;
                        } else if (strncmp(p + 1, "new", 3) != 0) {
                                ok = false;
                        }
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -25,7 +25,8 @@ enum heaptype {
        varheap,
        hashheap,
        imprintsheap,
-       orderidxheap
+       orderidxheap,
+       strhashheap
 };
 
 gdk_return ATOMheap(int id, Heap *hp, size_t cap)
@@ -255,6 +256,8 @@ void settailname(Heap *restrict tail, co
        __attribute__((__visibility__("hidden")));
 void strCleanHash(Heap *hp, bool rebuild)
        __attribute__((__visibility__("hidden")));
+void strDestroy(BAT *b)
+       __attribute__((__visibility__("hidden")));
 gdk_return strHeap(Heap *d, size_t cap)
        __attribute__((__visibility__("hidden")));
 var_t strLocate(BAT *b, const char *v)
diff --git a/gdk/gdk_storage.c b/gdk/gdk_storage.c
--- a/gdk/gdk_storage.c
+++ b/gdk/gdk_storage.c
@@ -1010,6 +1010,8 @@ BATdelete(BAT *b)
                } else {
                        HEAPfree(b->tvheap, true);
                }
+               if (b->strhash)
+                       strDestroy(b);
        }
        b->batCopiedtodisk = false;
 }
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -70,17 +70,20 @@ struct hash {
        BUN nbucket;
        BUN nentries;
        BUN growlim;
-       var_t *buckets;
+       uint64_t *buckets;
        Heap heap;
 };
 
-#define EMPTY ((var_t) -1)
+#define EMPTY ((uint64_t) -1)
 #define OFFSET 0
+#define PSLSHIFT 40
+#define PSLMASK ((UINT64_C(1) << PSLSHIFT) - 1)
 
 static inline BUN
 hash_str(struct hash *h, const char *value)
 {
-       BUN hsh = strHash(value) & h->mask2;
+       BUN hsh = strHash(value);
+       hsh &= h->mask2;
        if (hsh >= h->nbucket)
                hsh &= h->mask1;
        return hsh;
@@ -97,38 +100,37 @@ enter(BAT *b, const char *value)
                        newsize += 64 * 1024;
                } while (pos + len > newsize);
                if (HEAPgrow(&b->theaplock, &b->tvheap, newsize, true) != 
GDK_SUCCEED)
-                       return EMPTY;
+                       return (var_t) -1;
        }
        memcpy(b->tvheap->base + pos, value, len);
        b->tvheap->free += len;
+       b->tvheap->dirty = true;
        return pos;
 }
 
 static inline void
-reinsert(BAT *b, struct hash *h, BUN hsh, var_t pos)
+reinsert(struct hash *h, BUN hsh, var_t pos)
 {
-       BUN psl = 0;
-       var_t swp = EMPTY;
+       uint64_t psl = 0;
+       uint64_t swp = EMPTY;
 
        for (;;) {
-               var_t bkt = h->buckets[hsh];
+               uint64_t bkt = h->buckets[hsh];
                if (bkt == EMPTY) {
                        /* found an empty slot */
                        if (swp == EMPTY) {
-                               swp = pos;
+                               swp = (uint64_t) pos;
                        }
-                       h->buckets[hsh] = swp;
+                       h->buckets[hsh] = swp | psl << PSLSHIFT;
                        return;
                }
-               const char *v = b->tvheap->base + bkt;
-               BUN hsh2 = hash_str(h, v);
-               BUN psl2 = hsh < hsh2 ? hsh + h->nbucket - hsh2 : hsh - hsh2;
+               uint64_t psl2 = bkt >> PSLSHIFT;
                if (psl > psl2) {
                        if (swp == EMPTY) {
-                               swp = pos;
+                               swp = (uint64_t) pos;
                        }
-                       h->buckets[hsh] = swp;
-                       swp = bkt;
+                       h->buckets[hsh] = swp | psl << PSLSHIFT;
+                       swp = bkt & PSLMASK;
                        psl = psl2;
                }
                if (++hsh == h->nbucket)
@@ -138,60 +140,89 @@ reinsert(BAT *b, struct hash *h, BUN hsh
 }
 
 static inline void
-rmstring(BAT *b, struct hash *h, BUN pos)
+rmstring(struct hash *h, BUN pos)
 {
        h->buckets[pos] = EMPTY;
        for (;;) {
                BUN hsh = pos + 1;
                if (hsh >= h->nbucket)
                        hsh = 0;
-               var_t p = h->buckets[hsh];
-               if (p == EMPTY || hash_str(h, b->tvheap->base + p) == hsh)
+               uint64_t p = h->buckets[hsh];
+               if (p == EMPTY)
                        break;
-               h->buckets[pos] = p;
+               uint64_t psl = p >> PSLSHIFT;
+               if (psl == 0)
+                       break;
+               h->buckets[pos] = (p & PSLMASK) | (psl - 1) << PSLSHIFT;
                h->buckets[hsh] = EMPTY;
                pos = hsh;
        }
 }
 
+static inline void
+incpsl(struct hash *h)
+{
+       uint64_t hsh = 0;
+       for (;;) {
+               uint64_t bkt = h->buckets[hsh];
+               if (bkt == EMPTY)
+                       break;
+               uint64_t psl = bkt >> PSLSHIFT;
+               if (psl <= hsh)
+                       break;
+               h->buckets[hsh] = (bkt & PSLMASK) | (psl + 1) << PSLSHIFT;
+               if (++hsh == h->nbucket)
+                       break;
+       }
+}
+
 static inline gdk_return
 grow1(BAT *b, struct hash *h)
 {
-       struct hash oh = *h;
        BUN oldmask1 = h->mask1;
        BUN oldnbucket = h->nbucket;
 
-       if ((h->nbucket + OFFSET) * sizeof(var_t) >= h->heap.size) {
+       if ((h->nbucket + OFFSET) * sizeof(uint64_t) >= h->heap.size) {
                if (HEAPextend(&h->heap, h->heap.size + 64 * 1024, true) != 
GDK_SUCCEED)
                        return GDK_FAIL;
-               h->buckets = (var_t *) h->heap.base + OFFSET;
+               h->buckets = (uint64_t *) h->heap.base + OFFSET;
        }
-       /* before increment so that we use old hash */
-       rmstring(b, h, h->nbucket);
        if (h->nbucket == h->mask2) {
                h->mask1 = h->mask2;
                h->mask2 |= h->mask2 << 1; /* extend with one bit */
        }
        h->nbucket++;
-       h->heap.free = h->nbucket * sizeof(var_t);
-       h->growlim = (h->nbucket * 3) / 4;
+       h->heap.free = (h->nbucket + OFFSET) * sizeof(uint64_t);
+       h->growlim = (BUN) (h->nbucket * (0.75 - (h->nbucket & h->mask1) / (2.0 
* h->mask2)));
        assert(h->mask1 < h->nbucket && h->nbucket <= h->mask2);
 
+       incpsl(h);
+       rmstring(h, h->nbucket - 1);
+
        BUN hsh = oldnbucket & oldmask1;
+       uint64_t psl = 0;
 
        for (;;) {
-               BUN bkt = h->buckets[hsh];
+               uint64_t bkt = h->buckets[hsh];
                if (bkt == EMPTY)
                        break;
-               const char *v = b->tvheap->base + bkt;
-               BUN c = strHash(v);
-               if ((c & h->mask2) == oldnbucket) {
-                       /* this one should be moved */
-                       rmstring(b, &oh, hsh);
-                       reinsert(b, h, oldnbucket, bkt);
+               uint64_t psl2 = bkt >> PSLSHIFT;
+               if (psl2 < psl)
+                       break;
+               if (psl2 == psl) {
+                       bkt &= PSLMASK;
+                       const char *v = b->tvheap->base + bkt;
+                       BUN c = strHash(v);
+                       if ((c & h->mask2) == oldnbucket) {
+                               /* this one should be moved */
+                               rmstring(h, hsh);
+                               reinsert(h, oldnbucket, (var_t) bkt);
+                               continue;
+                       }
                }
                if (++hsh == oldnbucket)
                        hsh = 0;
+               psl++;
        }
        return GDK_SUCCEED;
 }
@@ -200,56 +231,51 @@ static var_t
 insert(BAT *b, struct hash *h, const char *value)
 {
        BUN hsh = hash_str(h, value);
-       BUN psl = 0;    /* probe sequence length */
-       var_t swp = EMPTY;      /* swapped entry */
-       var_t new = EMPTY;      /* new entry */
+       uint64_t psl = 0;       /* probe sequence length */
+       uint64_t swp = EMPTY;   /* swapped entry */
+       var_t new = (var_t) -1; /* new entry */
 
        for (;;) {
-               var_t bkt = h->buckets[hsh];
+               uint64_t bkt = h->buckets[hsh];
                if (bkt == EMPTY) {
                        /* found an empty slot */
                        if (swp == EMPTY) {
-                               assert(new == EMPTY);
+                               assert(new == (var_t) -1);
                                /* didn't enter value yet, so do it now */
-                               new = swp = enter(b, value);
-                               if (new == EMPTY)
+                               new = enter(b, value);
+                               if (new == (var_t) -1)
                                        return (var_t) -1;
+                               swp = (uint64_t) new;
                                h->nentries++;
-                               /* XXX maybe grow hash table (but first
-                                * write h->buckets[hsh]) */
-                               h->buckets[hsh] = swp;
-                               while (h->nentries == h->growlim)
-                                       if (grow1(b, h) != GDK_SUCCEED)
-                                               return (var_t) -1;
-                       } else {
-                               h->buckets[hsh] = swp;
                        }
+                       assert(new != (var_t) -1);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to