Changeset: 87653f83cbd5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/87653f83cbd5
Modified Files:
        gdk/gdk.h
        gdk/gdk_atoms.h
        gdk/gdk_bat.c
        gdk/gdk_batop.c
        gdk/gdk_bbp.c
        gdk/gdk_group.c
        gdk/gdk_heap.c
        gdk/gdk_private.h
        gdk/gdk_project.c
        gdk/gdk_select.c
        gdk/gdk_string.c
        monetdb5/extras/rapi/rapi.c
        sql/backends/monet5/UDF/pyapi3/conversion3.c
        sql/storage/bat/bat_storage.c
Branch: string-dedup
Log Message:

Implemented full string deduplication using Robin Hood hashing + linear hashing.


diffs (truncated from 1333 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -775,6 +775,7 @@ typedef struct BAT {
        MT_Lock theaplock;      /* lock protecting heap reference changes */
        MT_RWLock thashlock;    /* lock specifically for hash management */
        MT_Lock batIdxLock;     /* lock to manipulate other indexes/properties 
*/
+       struct hash *strhash;   /* hash structure for strings */
 } BAT;
 
 /* macros to hide complexity of the BAT structure */
@@ -1676,7 +1677,7 @@ tfastins_nocheckVAR(BAT *b, BUN p, const
        if ((rc = ATOMputVAR(b, &d, v)) != GDK_SUCCEED)
                return rc;
        if (b->twidth < SIZEOF_VAR_T &&
-           (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >= ((size_t) 1 << (8 << 
b->tshift))) {
+           d >= ((size_t) 1 << (8 << b->tshift))) {
                /* doesn't fit in current heap, upgrade it */
                rc = GDKupgradevarheap(b, d, 0, MAX(p, b->batCount));
                if (rc != GDK_SUCCEED)
@@ -1684,10 +1685,10 @@ tfastins_nocheckVAR(BAT *b, BUN p, const
        }
        switch (b->twidth) {
        case 1:
-               ((uint8_t *) b->theap->base)[p] = (uint8_t) (d - GDK_VAROFFSET);
+               ((uint8_t *) b->theap->base)[p] = (uint8_t) d;
                break;
        case 2:
-               ((uint16_t *) b->theap->base)[p] = (uint16_t) (d - 
GDK_VAROFFSET);
+               ((uint16_t *) b->theap->base)[p] = (uint16_t) d;
                break;
        case 4:
                ((uint32_t *) b->theap->base)[p] = (uint32_t) d;
diff --git a/gdk/gdk_atoms.h b/gdk/gdk_atoms.h
--- a/gdk/gdk_atoms.h
+++ b/gdk/gdk_atoms.h
@@ -379,11 +379,6 @@ ATOMreplaceVAR(BAT *b, var_t *dst, const
 #define GDK_STRHASHTABLE       (1<<10) /* 1024 */
 #define GDK_STRHASHMASK                (GDK_STRHASHTABLE-1)
 #define GDK_STRHASHSIZE                (GDK_STRHASHTABLE * sizeof(stridx_t))
-#define GDK_ELIMPOWER          16      /* 64KiB is the threshold */
-#define GDK_ELIMDOUBLES(h)     ((h)->free < GDK_ELIMLIMIT)
-#define GDK_ELIMLIMIT          (1<<GDK_ELIMPOWER)      /* equivalently: 
ELIMBASE == 0 */
-#define GDK_ELIMBASE(x)                (((x) >> GDK_ELIMPOWER) << 
GDK_ELIMPOWER)
-#define GDK_VAROFFSET          ((var_t) GDK_STRHASHSIZE)
 
 /*
  * @- String Comparison, NILs and UTF-8
@@ -432,9 +427,9 @@ VarHeapVal(const void *b, BUN p, int w)
 {
        switch (w) {
        case 1:
-               return (size_t) ((const uint8_t *) b)[p] + GDK_VAROFFSET;
+               return (size_t) ((const uint8_t *) b)[p];
        case 2:
-               return (size_t) ((const uint16_t *) b)[p] + GDK_VAROFFSET;
+               return (size_t) ((const uint16_t *) b)[p];
 #if SIZEOF_VAR_T == 8
        case 4:
                return (size_t) ((const uint32_t *) b)[p];
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -1434,10 +1434,10 @@ BUNinplacemulti(BAT *b, const oid *posit
                        _ptr = BUNtloc(bi, p);
                        switch (b->twidth) {
                        default:        /* only three or four cases possible */
-                               _d = (var_t) * (uint8_t *) _ptr + GDK_VAROFFSET;
+                               _d = (var_t) * (uint8_t *) _ptr;
                                break;
                        case 2:
-                               _d = (var_t) * (uint16_t *) _ptr + 
GDK_VAROFFSET;
+                               _d = (var_t) * (uint16_t *) _ptr;
                                break;
                        case 4:
                                _d = (var_t) * (uint32_t *) _ptr;
@@ -1453,7 +1453,7 @@ BUNinplacemulti(BAT *b, const oid *posit
                                return GDK_FAIL;
                        }
                        if (b->twidth < SIZEOF_VAR_T &&
-                           (b->twidth <= 2 ? _d - GDK_VAROFFSET : _d) >= 
((size_t) 1 << (8 << b->tshift))) {
+                           _d >= ((size_t) 1 << (8 << b->tshift))) {
                                /* doesn't fit in current heap, upgrade it */
                                if (GDKupgradevarheap(b, _d, 0, bi.count) != 
GDK_SUCCEED) {
                                        MT_rwlock_wrunlock(&b->thashlock);
@@ -1465,10 +1465,10 @@ BUNinplacemulti(BAT *b, const oid *posit
                        _ptr = BUNtloc(bi, p);
                        switch (b->twidth) {
                        default:        /* only three or four cases possible */
-                               * (uint8_t *) _ptr = (uint8_t) (_d - 
GDK_VAROFFSET);
+                               * (uint8_t *) _ptr = (uint8_t) _d;
                                break;
                        case 2:
-                               * (uint16_t *) _ptr = (uint16_t) (_d - 
GDK_VAROFFSET);
+                               * (uint16_t *) _ptr = (uint16_t) _d;
                                break;
                        case 4:
                                * (uint32_t *) _ptr = (uint32_t) _d;
@@ -2138,7 +2138,7 @@ HEAPcommitpersistence(Heap *hp, bool wri
 }
 
 
-#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str || GDK_ELIMDOUBLES(h))
+#define ATOMappendpriv(t, h) (ATOMstorage(t) != TYPE_str)
 
 /* change the heap modes at a commit */
 gdk_return
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -68,7 +68,6 @@ insert_string_bat(BAT *b, BAT *n, struct
        unsigned int tiv;       /* tail value-as-int */
 #endif
        var_t v;                /* value */
-       size_t off;             /* offset within n's string heap */
        BUN cnt = ci->ncand;
        BUN oldcnt = BATcount(b);
 
@@ -109,8 +108,7 @@ insert_string_bat(BAT *b, BAT *n, struct
                        bat_iterator_end(&ni);
                        return GDK_FAIL;
                }
-               if (oldcnt == 0 || (!GDK_ELIMDOUBLES(b->tvheap) &&
-                                   !GDK_ELIMDOUBLES(ni.vh))) {
+               if (oldcnt == 0) {
                        /* we'll consider copying the string heap completely
                         *
                         * we first estimate how much space the string heap
@@ -125,13 +123,7 @@ insert_string_bat(BAT *b, BAT *n, struct
                                len += strlen(BUNtvar(ni, p)) + 1;
                        }
                        len = (len + 512) / 1024; /* rounded average length */
-                       r = (GDK_ELIMLIMIT - GDK_STRHASHSIZE) / (len + 12);
-                       /* r is estimate of number of strings in
-                        * double-eliminated area */
-                       if (r < ci->ncand)
-                               len = GDK_ELIMLIMIT + (ci->ncand - r) * len;
-                       else
-                               len = GDK_STRHASHSIZE + ci->ncand * (len + 12);
+                       len = ci->ncand * len;
                        /* len is total estimated expected size of vheap */
 
                        if (len > ni.vh->free / 2) {
@@ -156,7 +148,7 @@ insert_string_bat(BAT *b, BAT *n, struct
        /* if toff has the initial value of ~0, we insert strings
         * individually, otherwise we only copy (insert) offsets */
        if (toff == ~(size_t) 0)
-               v = GDK_VAROFFSET;
+               v = 1;
        else
                v = b->tvheap->free - 1;
 
@@ -172,12 +164,11 @@ insert_string_bat(BAT *b, BAT *n, struct
                 * values, so we can use fast memcpy */
                MT_thread_setalgorithm("memcpy offsets");
                memcpy(Tloc(b, BUNlast(b)), (const char *) ni.base + ((ci->seq 
- n->hseqbase) << ni.shift), cnt << ni.shift);
-       } else if (toff != ~(size_t) 0) {
+       } else if (toff == 0) {
                /* we don't need to insert any actual strings since we
                 * have already made sure that they are all in b's
-                * string heap at known locations (namely the offset
-                * in n added to toff), so insert offsets from n after
-                * adding toff into b */
+                * string heap at known locations, so insert offsets
+                * from n into b */
                /* note the use of the "restrict" qualifier here: all
                 * four pointers below point to the same value, but
                 * only one of them will actually be used, hence we
@@ -219,10 +210,10 @@ insert_string_bat(BAT *b, BAT *n, struct
                        p = canditer_next(ci) - n->hseqbase;
                        switch (ni.width) {
                        case 1:
-                               v = (var_t) tbp[p] + GDK_VAROFFSET;
+                               v = (var_t) tbp[p];
                                break;
                        case 2:
-                               v = (var_t) tsp[p] + GDK_VAROFFSET;
+                               v = (var_t) tsp[p];
                                break;
 #if SIZEOF_VAR_T == 8
                        case 4:
@@ -233,17 +224,15 @@ insert_string_bat(BAT *b, BAT *n, struct
                                v = tvp[p];
                                break;
                        }
-                       v = (var_t) ((size_t) v + toff);
-                       assert(v >= GDK_VAROFFSET);
                        assert((size_t) v < b->tvheap->free);
                        switch (b->twidth) {
                        case 1:
-                               assert(v - GDK_VAROFFSET < ((var_t) 1 << 8));
-                               ((uint8_t *) b->theap->base)[r++] = (uint8_t) 
(v - GDK_VAROFFSET);
+                               assert(v < ((var_t) 1 << 8));
+                               ((uint8_t *) b->theap->base)[r++] = (uint8_t) v;
                                break;
                        case 2:
-                               assert(v - GDK_VAROFFSET < ((var_t) 1 << 16));
-                               ((uint16_t *) b->theap->base)[r++] = (uint16_t) 
(v - GDK_VAROFFSET);
+                               assert(v < ((var_t) 1 << 16));
+                               ((uint16_t *) b->theap->base)[r++] = (uint16_t) 
v;
                                break;
 #if SIZEOF_VAR_T == 8
                        case 4:
@@ -256,13 +245,9 @@ insert_string_bat(BAT *b, BAT *n, struct
                                break;
                        }
                }
-       } else if (b->tvheap->free < ni.vh->free / 2 ||
-                  GDK_ELIMDOUBLES(b->tvheap)) {
-               /* if b's string heap is much smaller than n's string
-                * heap, don't bother checking whether n's string
-                * values occur in b's string heap; also, if b is
-                * (still) fully double eliminated, we must continue
-                * to use the double elimination mechanism */
+       } else {
+               /* we must continue to use the double elimination
+                * mechanism by inserting individual strings */
                r = b->batCount;
                oid hseq = n->hseqbase;
                MT_thread_setalgorithm("insert string values");
@@ -276,55 +261,6 @@ insert_string_bat(BAT *b, BAT *n, struct
                        }
                        r++;
                }
-       } else {
-               /* Insert values from n individually into b; however,
-                * we check whether there is a string in b's string
-                * heap at the same offset as the string is in n's
-                * string heap (in case b's string heap is a copy of
-                * n's).  If this is the case, we just copy the
-                * offset, otherwise we insert normally.  */
-               r = b->batCount;
-               MT_thread_setalgorithm("insert string values with check");
-               while (cnt > 0) {
-                       cnt--;
-                       p = canditer_next(ci) - n->hseqbase;
-                       off = BUNtvaroff(ni, p); /* the offset */
-                       tp = ni.vh->base + off; /* the string */
-                       if (off < b->tvheap->free &&
-                           strcmp(b->tvheap->base + off, tp) == 0) {
-                               /* we found the string at the same
-                                * offset in b's string heap as it was
-                                * in n's string heap, so we don't
-                                * have to insert a new string into b:
-                                * we can just copy the offset */
-                               v = (var_t) off;
-                               switch (b->twidth) {
-                               case 1:
-                                       assert(v - GDK_VAROFFSET < ((var_t) 1 
<< 8));
-                                       ((uint8_t *) b->theap->base)[r] = 
(unsigned char) (v - GDK_VAROFFSET);
-                                       break;
-                               case 2:
-                                       assert(v - GDK_VAROFFSET < ((var_t) 1 
<< 16));
-                                       ((uint16_t *) b->theap->base)[r] = 
(unsigned short) (v - GDK_VAROFFSET);
-                                       break;
-#if SIZEOF_VAR_T == 8
-                               case 4:
-                                       assert(v < ((var_t) 1 << 32));
-                                       ((uint32_t *) b->theap->base)[r] = 
(unsigned int) v;
-                                       break;
-#endif
-                               default:
-                                       ((var_t *) b->theap->base)[r] = v;
-                                       break;
-                               }
-                       } else {
-                               if (tfastins_nocheckVAR(b, r, tp) != 
GDK_SUCCEED) {
-                                       bat_iterator_end(&ni);
-                                       return GDK_FAIL;
-                               }
-                       }
-                       r++;
-               }
        }
        BATsetcount(b, oldcnt + ci->ncand);
        bat_iterator_end(&ni);
@@ -1241,10 +1177,10 @@ BATappend_or_update(BAT *b, BAT *p, cons
                        var_t d;
                        switch (b->twidth) {
                        default: /* only three or four cases possible */
-                               d = (var_t) ((uint8_t *) b->theap->base)[updid] 
+ GDK_VAROFFSET;
+                               d = (var_t) ((uint8_t *) b->theap->base)[updid];
                                break;
                        case 2:
-                               d = (var_t) ((uint16_t *) 
b->theap->base)[updid] + GDK_VAROFFSET;
+                               d = (var_t) ((uint16_t *) 
b->theap->base)[updid];
                                break;
                        case 4:
                                d = (var_t) ((uint32_t *) 
b->theap->base)[updid];
@@ -1259,7 +1195,7 @@ BATappend_or_update(BAT *b, BAT *p, cons
                                goto bailout;
                        }
                        if (b->twidth < SIZEOF_VAR_T &&
-                           (b->twidth <= 2 ? d - GDK_VAROFFSET : d) >= 
((size_t) 1 << (8 << b->tshift))) {
+                           d >= ((size_t) 1 << (8 << b->tshift))) {
                                /* doesn't fit in current heap, upgrade it */
                                if (GDKupgradevarheap(b, d, 0, MAX(updid, 
b->batCount)) != GDK_SUCCEED) {
                                        goto bailout;
@@ -1271,10 +1207,10 @@ BATappend_or_update(BAT *b, BAT *p, cons
                        bi = bat_iterator_nolock(b);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to