Changeset: 87653f83cbd5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/87653f83cbd5
Modified Files:
gdk/gdk.h
gdk/gdk_atoms.h
gdk/gdk_bat.c
gdk/gdk_batop.c
gdk/gdk_bbp.c
gdk/gdk_group.c
gdk/gdk_heap.c
gdk/gdk_private.h
gdk/gdk_project.c
gdk/gdk_select.c
gdk/gdk_string.c
monetdb5/extras/rapi/rapi.c
sql/backends/monet5/UDF/pyapi3/conversion3.c
sql/storage/bat/bat_storage.c
Branch: string-dedup
Log Message:
Implemented full string deduplication using Robin Hood hashing + linear hashing.
diffs (truncated from 1333 to 300 lines):
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -775,6 +775,7 @@ typedef struct BAT {
MT_Lock theaplock; /* lock protecting heap reference changes */
MT_RWLock thashlock; /* lock specifically for hash management */
MT_Lock batIdxLock; /* lock to manipulate other indexes/properties
*/
+ struct hash *strhash; /* hash structure for strings */
} BAT;
/* macros to hide complexity of the BAT structure */
@@ -1676,7 +1677,7 @@ tfastins_nocheckVAR(BAT *b, BUN p, const
if ((rc = ATOMputVAR(b, &d, v)) != GDK_SUCCEED)
return rc;
if (b->twidth < SIZEOF_VAR_T &&
- (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >= ((size_t) 1 << (8 <<
b->tshift))) {
+ d >= ((size_t) 1 << (8 << b->tshift))) {
/* doesn't fit in current heap, upgrade it */
rc = GDKupgradevarheap(b, d, 0, MAX(p, b->batCount));
if (rc != GDK_SUCCEED)
@@ -1684,10 +1685,10 @@ tfastins_nocheckVAR(BAT *b, BUN p, const
}
switch (b->twidth) {
case 1:
- ((uint8_t *) b->theap->base)[p] = (uint8_t) (d - GDK_VAROFFSET);
+ ((uint8_t *) b->theap->base)[p] = (uint8_t) d;
break;
case 2:
- ((uint16_t *) b->theap->base)[p] = (uint16_t) (d -
GDK_VAROFFSET);
+ ((uint16_t *) b->theap->base)[p] = (uint16_t) d;
break;
case 4:
((uint32_t *) b->theap->base)[p] = (uint32_t) d;
diff --git a/gdk/gdk_atoms.h b/gdk/gdk_atoms.h
--- a/gdk/gdk_atoms.h
+++ b/gdk/gdk_atoms.h
@@ -379,11 +379,6 @@ ATOMreplaceVAR(BAT *b, var_t *dst, const
#define GDK_STRHASHTABLE (1<<10) /* 1024 */
#define GDK_STRHASHMASK (GDK_STRHASHTABLE-1)
#define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t))
-#define GDK_ELIMPOWER 16 /* 64KiB is the threshold */
-#define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT)
-#define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently:
ELIMBASE == 0 */
-#define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) <<
GDK_ELIMPOWER)
-#define GDK_VAROFFSET ((var_t) GDK_STRHASHSIZE)
/*
* @- String Comparison, NILs and UTF-8
@@ -432,9 +427,9 @@ VarHeapVal(const void *b, BUN p, int w)
{
switch (w) {
case 1:
- return (size_t) ((const uint8_t *) b)[p] + GDK_VAROFFSET;
+ return (size_t) ((const uint8_t *) b)[p];
case 2:
- return (size_t) ((const uint16_t *) b)[p] + GDK_VAROFFSET;
+ return (size_t) ((const uint16_t *) b)[p];
#if SIZEOF_VAR_T == 8
case 4:
return (size_t) ((const uint32_t *) b)[p];
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -1434,10 +1434,10 @@ BUNinplacemulti(BAT *b, const oid *posit
_ptr = BUNtloc(bi, p);
switch (b->twidth) {
default: /* only three or four cases possible */
- _d = (var_t) * (uint8_t *) _ptr + GDK_VAROFFSET;
+ _d = (var_t) * (uint8_t *) _ptr;
break;
case 2:
- _d = (var_t) * (uint16_t *) _ptr +
GDK_VAROFFSET;
+ _d = (var_t) * (uint16_t *) _ptr;
break;
case 4:
_d = (var_t) * (uint32_t *) _ptr;
@@ -1453,7 +1453,7 @@ BUNinplacemulti(BAT *b, const oid *posit
return GDK_FAIL;
}
if (b->twidth < SIZEOF_VAR_T &&
- (b->twidth <= 2 ? _d - GDK_VAROFFSET : _d) >=
((size_t) 1 << (8 << b->tshift))) {
+ _d >= ((size_t) 1 << (8 << b->tshift))) {
/* doesn't fit in current heap, upgrade it */
if (GDKupgradevarheap(b, _d, 0, bi.count) !=
GDK_SUCCEED) {
MT_rwlock_wrunlock(&b->thashlock);
@@ -1465,10 +1465,10 @@ BUNinplacemulti(BAT *b, const oid *posit
_ptr = BUNtloc(bi, p);
switch (b->twidth) {
default: /* only three or four cases possible */
- * (uint8_t *) _ptr = (uint8_t) (_d -
GDK_VAROFFSET);
+ * (uint8_t *) _ptr = (uint8_t) _d;
break;
case 2:
- * (uint16_t *) _ptr = (uint16_t) (_d -
GDK_VAROFFSET);
+ * (uint16_t *) _ptr = (uint16_t) _d;
break;
case 4:
* (uint32_t *) _ptr = (uint32_t) _d;
@@ -2138,7 +2138,7 @@ HEAPcommitpersistence(Heap *hp, bool wri
}
-#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str || GDK_ELIMDOUBLES(h))
+#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str)
/* change the heap modes at a commit */
gdk_return
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -68,7 +68,6 @@ insert_string_bat(BAT *b, BAT *n, struct
unsigned int tiv; /* tail value-as-int */
#endif
var_t v; /* value */
- size_t off; /* offset within n's string heap */
BUN cnt = ci->ncand;
BUN oldcnt = BATcount(b);
@@ -109,8 +108,7 @@ insert_string_bat(BAT *b, BAT *n, struct
bat_iterator_end(&ni);
return GDK_FAIL;
}
- if (oldcnt == 0 || (!GDK_ELIMDOUBLES(b->tvheap) &&
- !GDK_ELIMDOUBLES(ni.vh))) {
+ if (oldcnt == 0) {
/* we'll consider copying the string heap completely
*
* we first estimate how much space the string heap
@@ -125,13 +123,7 @@ insert_string_bat(BAT *b, BAT *n, struct
len += strlen(BUNtvar(ni, p)) + 1;
}
len = (len + 512) / 1024; /* rounded average length */
- r = (GDK_ELIMLIMIT - GDK_STRHASHSIZE) / (len + 12);
- /* r is estimate of number of strings in
- * double-eliminated area */
- if (r < ci->ncand)
- len = GDK_ELIMLIMIT + (ci->ncand - r) * len;
- else
- len = GDK_STRHASHSIZE + ci->ncand * (len + 12);
+ len = ci->ncand * len;
/* len is total estimated expected size of vheap */
if (len > ni.vh->free / 2) {
@@ -156,7 +148,7 @@ insert_string_bat(BAT *b, BAT *n, struct
/* if toff has the initial value of ~0, we insert strings
* individually, otherwise we only copy (insert) offsets */
if (toff == ~(size_t) 0)
- v = GDK_VAROFFSET;
+ v = 1;
else
v = b->tvheap->free - 1;
@@ -172,12 +164,11 @@ insert_string_bat(BAT *b, BAT *n, struct
* values, so we can use fast memcpy */
MT_thread_setalgorithm("memcpy offsets");
memcpy(Tloc(b, BUNlast(b)), (const char *) ni.base + ((ci->seq
- n->hseqbase) << ni.shift), cnt << ni.shift);
- } else if (toff != ~(size_t) 0) {
+ } else if (toff == 0) {
/* we don't need to insert any actual strings since we
* have already made sure that they are all in b's
- * string heap at known locations (namely the offset
- * in n added to toff), so insert offsets from n after
- * adding toff into b */
+ * string heap at known locations, so insert offsets
+ * from n into b */
/* note the use of the "restrict" qualifier here: all
* four pointers below point to the same value, but
* only one of them will actually be used, hence we
@@ -219,10 +210,10 @@ insert_string_bat(BAT *b, BAT *n, struct
p = canditer_next(ci) - n->hseqbase;
switch (ni.width) {
case 1:
- v = (var_t) tbp[p] + GDK_VAROFFSET;
+ v = (var_t) tbp[p];
break;
case 2:
- v = (var_t) tsp[p] + GDK_VAROFFSET;
+ v = (var_t) tsp[p];
break;
#if SIZEOF_VAR_T == 8
case 4:
@@ -233,17 +224,15 @@ insert_string_bat(BAT *b, BAT *n, struct
v = tvp[p];
break;
}
- v = (var_t) ((size_t) v + toff);
- assert(v >= GDK_VAROFFSET);
assert((size_t) v < b->tvheap->free);
switch (b->twidth) {
case 1:
- assert(v - GDK_VAROFFSET < ((var_t) 1 << 8));
- ((uint8_t *) b->theap->base)[r++] = (uint8_t)
(v - GDK_VAROFFSET);
+ assert(v < ((var_t) 1 << 8));
+ ((uint8_t *) b->theap->base)[r++] = (uint8_t) v;
break;
case 2:
- assert(v - GDK_VAROFFSET < ((var_t) 1 << 16));
- ((uint16_t *) b->theap->base)[r++] = (uint16_t)
(v - GDK_VAROFFSET);
+ assert(v < ((var_t) 1 << 16));
+ ((uint16_t *) b->theap->base)[r++] = (uint16_t)
v;
break;
#if SIZEOF_VAR_T == 8
case 4:
@@ -256,13 +245,9 @@ insert_string_bat(BAT *b, BAT *n, struct
break;
}
}
- } else if (b->tvheap->free < ni.vh->free / 2 ||
- GDK_ELIMDOUBLES(b->tvheap)) {
- /* if b's string heap is much smaller than n's string
- * heap, don't bother checking whether n's string
- * values occur in b's string heap; also, if b is
- * (still) fully double eliminated, we must continue
- * to use the double elimination mechanism */
+ } else {
+ /* we must continue to use the double elimination
+ * mechanism by inserting individual strings */
r = b->batCount;
oid hseq = n->hseqbase;
MT_thread_setalgorithm("insert string values");
@@ -276,55 +261,6 @@ insert_string_bat(BAT *b, BAT *n, struct
}
r++;
}
- } else {
- /* Insert values from n individually into b; however,
- * we check whether there is a string in b's string
- * heap at the same offset as the string is in n's
- * string heap (in case b's string heap is a copy of
- * n's). If this is the case, we just copy the
- * offset, otherwise we insert normally. */
- r = b->batCount;
- MT_thread_setalgorithm("insert string values with check");
- while (cnt > 0) {
- cnt--;
- p = canditer_next(ci) - n->hseqbase;
- off = BUNtvaroff(ni, p); /* the offset */
- tp = ni.vh->base + off; /* the string */
- if (off < b->tvheap->free &&
- strcmp(b->tvheap->base + off, tp) == 0) {
- /* we found the string at the same
- * offset in b's string heap as it was
- * in n's string heap, so we don't
- * have to insert a new string into b:
- * we can just copy the offset */
- v = (var_t) off;
- switch (b->twidth) {
- case 1:
- assert(v - GDK_VAROFFSET < ((var_t) 1
<< 8));
- ((uint8_t *) b->theap->base)[r] =
(unsigned char) (v - GDK_VAROFFSET);
- break;
- case 2:
- assert(v - GDK_VAROFFSET < ((var_t) 1
<< 16));
- ((uint16_t *) b->theap->base)[r] =
(unsigned short) (v - GDK_VAROFFSET);
- break;
-#if SIZEOF_VAR_T == 8
- case 4:
- assert(v < ((var_t) 1 << 32));
- ((uint32_t *) b->theap->base)[r] =
(unsigned int) v;
- break;
-#endif
- default:
- ((var_t *) b->theap->base)[r] = v;
- break;
- }
- } else {
- if (tfastins_nocheckVAR(b, r, tp) !=
GDK_SUCCEED) {
- bat_iterator_end(&ni);
- return GDK_FAIL;
- }
- }
- r++;
- }
}
BATsetcount(b, oldcnt + ci->ncand);
bat_iterator_end(&ni);
@@ -1241,10 +1177,10 @@ BATappend_or_update(BAT *b, BAT *p, cons
var_t d;
switch (b->twidth) {
default: /* only three or four cases possible */
- d = (var_t) ((uint8_t *) b->theap->base)[updid]
+ GDK_VAROFFSET;
+ d = (var_t) ((uint8_t *) b->theap->base)[updid];
break;
case 2:
- d = (var_t) ((uint16_t *)
b->theap->base)[updid] + GDK_VAROFFSET;
+ d = (var_t) ((uint16_t *)
b->theap->base)[updid];
break;
case 4:
d = (var_t) ((uint32_t *)
b->theap->base)[updid];
@@ -1259,7 +1195,7 @@ BATappend_or_update(BAT *b, BAT *p, cons
goto bailout;
}
if (b->twidth < SIZEOF_VAR_T &&
- (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >=
((size_t) 1 << (8 << b->tshift))) {
+ d >= ((size_t) 1 << (8 << b->tshift))) {
/* doesn't fit in current heap, upgrade it */
if (GDKupgradevarheap(b, d, 0, MAX(updid,
b->batCount)) != GDK_SUCCEED) {
goto bailout;
@@ -1271,10 +1207,10 @@ BATappend_or_update(BAT *b, BAT *p, cons
bi = bat_iterator_nolock(b);
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list