Changeset: d593e5eda67f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d593e5eda67f
Modified Files:
gdk/gdk_batop.c
gdk/gdk_bbp.c
gdk/gdk_private.h
gdk/gdk_storage.c
gdk/gdk_string.c
testing/Mtest.py.in
Branch: string-dedup
Log Message:
Store probe sequence length in string hash buckets.
diffs (truncated from 480 to 300 lines):
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -2321,7 +2321,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
bn = BATproject(o, b);
if (bn == NULL)
goto error;
- if (bn->ttype == TYPE_void || isVIEW(bn)) {
+ if (bn->ttype == TYPE_void || bn->theap->parentid !=
bn->batCacheid) {
BAT *b2 = COLcopy(bn, ATOMtype(bn->ttype), true,
TRANSIENT);
BBPunfix(bn->batCacheid);
bn = b2;
@@ -2423,7 +2423,7 @@ BATsort(BAT **sorted, BAT **order, BAT *
assert(g->ttype == TYPE_oid);
grps = (oid *) Tloc(g, 0);
prev = grps[0];
- if (BATmaterialize(bn) != GDK_SUCCEED)
+ if (bn->ttype == TYPE_void && BATmaterialize(bn) != GDK_SUCCEED)
goto error;
for (r = 0, p = 1, q = BATcount(g); p < q; p++) {
if (grps[p] != prev) {
diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c
--- a/gdk/gdk_bbp.c
+++ b/gdk/gdk_bbp.c
@@ -339,6 +339,8 @@ BBPselectfarm(role_t role, int type, enu
if (hptype == orderidxheap)
role = TRANSIENT;
#endif
+ if (hptype == strhashheap)
+ role = TRANSIENT; /* for now */
for (i = 0; i < MAXFARMS; i++)
if (BBPfarms[i].roles & (1U << (int) role))
return i;
@@ -4035,6 +4037,9 @@ BBPdiskscan(const char *parent, size_t b
#else
delete = true;
#endif
+ } else if (strncmp(p + 1, "tstrhash", 8) == 0) {
+ /* string hash, we rebuild */
+ delete = true;
} else if (strncmp(p + 1, "new", 3) != 0) {
ok = false;
}
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -25,7 +25,8 @@ enum heaptype {
varheap,
hashheap,
imprintsheap,
- orderidxheap
+ orderidxheap,
+ strhashheap
};
gdk_return ATOMheap(int id, Heap *hp, size_t cap)
@@ -255,6 +256,8 @@ void settailname(Heap *restrict tail, co
__attribute__((__visibility__("hidden")));
void strCleanHash(Heap *hp, bool rebuild)
__attribute__((__visibility__("hidden")));
+void strDestroy(BAT *b)
+ __attribute__((__visibility__("hidden")));
gdk_return strHeap(Heap *d, size_t cap)
__attribute__((__visibility__("hidden")));
var_t strLocate(BAT *b, const char *v)
diff --git a/gdk/gdk_storage.c b/gdk/gdk_storage.c
--- a/gdk/gdk_storage.c
+++ b/gdk/gdk_storage.c
@@ -1010,6 +1010,8 @@ BATdelete(BAT *b)
} else {
HEAPfree(b->tvheap, true);
}
+ if (b->strhash)
+ strDestroy(b);
}
b->batCopiedtodisk = false;
}
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -70,17 +70,20 @@ struct hash {
BUN nbucket;
BUN nentries;
BUN growlim;
- var_t *buckets;
+ uint64_t *buckets;
Heap heap;
};
-#define EMPTY ((var_t) -1)
+#define EMPTY ((uint64_t) -1)
#define OFFSET 0
+#define PSLSHIFT 40
+#define PSLMASK ((UINT64_C(1) << PSLSHIFT) - 1)
static inline BUN
hash_str(struct hash *h, const char *value)
{
- BUN hsh = strHash(value) & h->mask2;
+ BUN hsh = strHash(value);
+ hsh &= h->mask2;
if (hsh >= h->nbucket)
hsh &= h->mask1;
return hsh;
@@ -97,38 +100,37 @@ enter(BAT *b, const char *value)
newsize += 64 * 1024;
} while (pos + len > newsize);
if (HEAPgrow(&b->theaplock, &b->tvheap, newsize, true) !=
GDK_SUCCEED)
- return EMPTY;
+ return (var_t) -1;
}
memcpy(b->tvheap->base + pos, value, len);
b->tvheap->free += len;
+ b->tvheap->dirty = true;
return pos;
}
static inline void
-reinsert(BAT *b, struct hash *h, BUN hsh, var_t pos)
+reinsert(struct hash *h, BUN hsh, var_t pos)
{
- BUN psl = 0;
- var_t swp = EMPTY;
+ uint64_t psl = 0;
+ uint64_t swp = EMPTY;
for (;;) {
- var_t bkt = h->buckets[hsh];
+ uint64_t bkt = h->buckets[hsh];
if (bkt == EMPTY) {
/* found an empty slot */
if (swp == EMPTY) {
- swp = pos;
+ swp = (uint64_t) pos;
}
- h->buckets[hsh] = swp;
+ h->buckets[hsh] = swp | psl << PSLSHIFT;
return;
}
- const char *v = b->tvheap->base + bkt;
- BUN hsh2 = hash_str(h, v);
- BUN psl2 = hsh < hsh2 ? hsh + h->nbucket - hsh2 : hsh - hsh2;
+ uint64_t psl2 = bkt >> PSLSHIFT;
if (psl > psl2) {
if (swp == EMPTY) {
- swp = pos;
+ swp = (uint64_t) pos;
}
- h->buckets[hsh] = swp;
- swp = bkt;
+ h->buckets[hsh] = swp | psl << PSLSHIFT;
+ swp = bkt & PSLMASK;
psl = psl2;
}
if (++hsh == h->nbucket)
@@ -138,60 +140,89 @@ reinsert(BAT *b, struct hash *h, BUN hsh
}
static inline void
-rmstring(BAT *b, struct hash *h, BUN pos)
+rmstring(struct hash *h, BUN pos)
{
h->buckets[pos] = EMPTY;
for (;;) {
BUN hsh = pos + 1;
if (hsh >= h->nbucket)
hsh = 0;
- var_t p = h->buckets[hsh];
- if (p == EMPTY || hash_str(h, b->tvheap->base + p) == hsh)
+ uint64_t p = h->buckets[hsh];
+ if (p == EMPTY)
break;
- h->buckets[pos] = p;
+ uint64_t psl = p >> PSLSHIFT;
+ if (psl == 0)
+ break;
+ h->buckets[pos] = (p & PSLMASK) | (psl - 1) << PSLSHIFT;
h->buckets[hsh] = EMPTY;
pos = hsh;
}
}
+static inline void
+incpsl(struct hash *h)
+{
+ uint64_t hsh = 0;
+ for (;;) {
+ uint64_t bkt = h->buckets[hsh];
+ if (bkt == EMPTY)
+ break;
+ uint64_t psl = bkt >> PSLSHIFT;
+ if (psl <= hsh)
+ break;
+ h->buckets[hsh] = (bkt & PSLMASK) | (psl + 1) << PSLSHIFT;
+ if (++hsh == h->nbucket)
+ break;
+ }
+}
+
static inline gdk_return
grow1(BAT *b, struct hash *h)
{
- struct hash oh = *h;
BUN oldmask1 = h->mask1;
BUN oldnbucket = h->nbucket;
- if ((h->nbucket + OFFSET) * sizeof(var_t) >= h->heap.size) {
+ if ((h->nbucket + OFFSET) * sizeof(uint64_t) >= h->heap.size) {
if (HEAPextend(&h->heap, h->heap.size + 64 * 1024, true) !=
GDK_SUCCEED)
return GDK_FAIL;
- h->buckets = (var_t *) h->heap.base + OFFSET;
+ h->buckets = (uint64_t *) h->heap.base + OFFSET;
}
- /* before increment so that we use old hash */
- rmstring(b, h, h->nbucket);
if (h->nbucket == h->mask2) {
h->mask1 = h->mask2;
h->mask2 |= h->mask2 << 1; /* extend with one bit */
}
h->nbucket++;
- h->heap.free = h->nbucket * sizeof(var_t);
- h->growlim = (h->nbucket * 3) / 4;
+ h->heap.free = (h->nbucket + OFFSET) * sizeof(uint64_t);
+ h->growlim = (BUN) (h->nbucket * (0.75 - (h->nbucket & h->mask1) / (2.0
* h->mask2)));
assert(h->mask1 < h->nbucket && h->nbucket <= h->mask2);
+ incpsl(h);
+ rmstring(h, h->nbucket - 1);
+
BUN hsh = oldnbucket & oldmask1;
+ uint64_t psl = 0;
for (;;) {
- BUN bkt = h->buckets[hsh];
+ uint64_t bkt = h->buckets[hsh];
if (bkt == EMPTY)
break;
- const char *v = b->tvheap->base + bkt;
- BUN c = strHash(v);
- if ((c & h->mask2) == oldnbucket) {
- /* this one should be moved */
- rmstring(b, &oh, hsh);
- reinsert(b, h, oldnbucket, bkt);
+ uint64_t psl2 = bkt >> PSLSHIFT;
+ if (psl2 < psl)
+ break;
+ if (psl2 == psl) {
+ bkt &= PSLMASK;
+ const char *v = b->tvheap->base + bkt;
+ BUN c = strHash(v);
+ if ((c & h->mask2) == oldnbucket) {
+ /* this one should be moved */
+ rmstring(h, hsh);
+ reinsert(h, oldnbucket, (var_t) bkt);
+ continue;
+ }
}
if (++hsh == oldnbucket)
hsh = 0;
+ psl++;
}
return GDK_SUCCEED;
}
@@ -200,56 +231,51 @@ static var_t
insert(BAT *b, struct hash *h, const char *value)
{
BUN hsh = hash_str(h, value);
- BUN psl = 0; /* probe sequence length */
- var_t swp = EMPTY; /* swapped entry */
- var_t new = EMPTY; /* new entry */
+ uint64_t psl = 0; /* probe sequence length */
+ uint64_t swp = EMPTY; /* swapped entry */
+ var_t new = (var_t) -1; /* new entry */
for (;;) {
- var_t bkt = h->buckets[hsh];
+ uint64_t bkt = h->buckets[hsh];
if (bkt == EMPTY) {
/* found an empty slot */
if (swp == EMPTY) {
- assert(new == EMPTY);
+ assert(new == (var_t) -1);
/* didn't enter value yet, so do it now */
- new = swp = enter(b, value);
- if (new == EMPTY)
+ new = enter(b, value);
+ if (new == (var_t) -1)
return (var_t) -1;
+ swp = (uint64_t) new;
h->nentries++;
- /* XXX maybe grow hash table (but first
- * write h->buckets[hsh]) */
- h->buckets[hsh] = swp;
- while (h->nentries == h->growlim)
- if (grow1(b, h) != GDK_SUCCEED)
- return (var_t) -1;
- } else {
- h->buckets[hsh] = swp;
}
+ assert(new != (var_t) -1);
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list