Changeset: 87379087770d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=87379087770d
Modified Files:
        gdk/gdk_bat.c
        gdk/gdk_delta.c
        gdk/gdk_group.c
        gdk/gdk_unique.c
Branch: Oct2014
Log Message:

In BATsubunique and BATgroup make better use of the hash links.
Our hash links have the property that they go from high to low indexes
in a BAT.  Also, we can very easily jump into the middle of a chain
when we already know the position a value has in the BAT.  Use this
knowledge in BATsubunique and BATgroup.  We now also create a hash
when the BAT being examined is persistent, and we check whether the
parent of a view already has a hash (we don't build a new hash in that
case).
In order to make sure that our hash links keep going from high to low,
updating the hash has been removed when deleting or replacing entries
in a BAT.  Instead we just destroy the hash table in that case.  Note
that append can still update the hash.


diffs (truncated from 525 to 300 lines):

diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -502,12 +502,10 @@ BATclear(BAT *b, int force)
 {
        BUN p, q;
        int voidbat;
-       BAT *bm;
 
        BATcheck(b, "BATclear");
 
        voidbat = 0;
-       bm = BATmirror(b);
 
        if (BAThdense(b) && b->htype == TYPE_void) {
                voidbat = 1;
@@ -525,12 +523,7 @@ BATclear(BAT *b, int force)
        }
 
        /* kill all search accelerators */
-       if (b->H->hash) {
-               HASHremove(b);
-       }
-       if (b->T->hash) {
-               HASHremove(bm);
-       }
+       HASHdestroy(b);
        IMPSdestroy(b);
 
        /* we must dispose of all inserted atoms */
@@ -1006,19 +999,7 @@ BATcopy(BAT *b, int ht, int tt, int writ
                                *_dst++ = *_src++;                      \
                }                                                       \
        } while (0)
-#define hacc_update(func, get, p, idx)                                 \
-       do {                                                            \
-               if (b->H->hash) {                                       \
-                       func(b->H->hash, idx, get(bi, p), p < last);    \
-               }                                                       \
-       } while (0)
-#define tacc_update(func, get, p, idx)                                 \
-       do {                                                            \
-               if (b->T->hash) {                                       \
-                       func(b->T->hash, idx, get(bi, p), p < last);    \
-               }                                                       \
-       } while (0)
-#define acc_move(l, p, idx2, idx1)                                     \
+#define acc_move(l, p)                                                 \
        do {                                                            \
                char tmp[16];                                           \
                /* avoid compiler warning: dereferencing type-punned pointer \
@@ -1027,12 +1008,6 @@ BATcopy(BAT *b, int ht, int tt, int writ
                                                                        \
                assert(hs <= 16);                                       \
                assert(ts <= 16);                                       \
-               if (b->H->hash) {                                       \
-                       HASHmove(b->H->hash, idx2, idx1, BUNhead(bi, l), l < 
last); \
-               }                                                       \
-               if (b->T->hash) {                                       \
-                       HASHmove(b->T->hash, idx2, idx1, BUNtail(bi, l), l < 
last); \
-               }                                                       \
                                                                        \
                /* move first to tmp */                                 \
                un_move(Hloc(b, l), tmpp, hs);                          \
@@ -1196,14 +1171,13 @@ BUNins(BAT *b, const void *h, const void
                if (BUNinplace(bm, p, t, h, force) == NULL)
                        return NULL;
        } else {
-               size_t hsize = 0, tsize = 0;
-
                p = BUNlast(b); /* insert at end */
                if (p == BUN_MAX || b->batCount == BUN_MAX) {
                        GDKerror("BUNins: bat too large\n");
                        return NULL;
                }
 
+               HASHdestroy(b);
                if (unshare_string_heap(b) == GDK_FAIL) {
                        GDKerror("BUNins: failed to unshare string heap\n");
                        return NULL;
@@ -1211,10 +1185,6 @@ BUNins(BAT *b, const void *h, const void
 
                ALIGNins(b, "BUNins", force);
                b->batDirty = 1;
-               if (b->H->hash && b->H->vheap)
-                       hsize = b->H->vheap->size;
-               if (b->T->hash && b->T->vheap)
-                       tsize = b->T->vheap->size;
 
                setcolprops(b, b->H, h);
                setcolprops(b, b->T, t);
@@ -1224,18 +1194,6 @@ BUNins(BAT *b, const void *h, const void
                } else {
                        BATsetcount(b, b->batCount + 1);
                }
-
-               if (b->H->hash) {
-                       HASHins(b, p, h);
-                       if (hsize && hsize != b->H->vheap->size)
-                               HEAPwarm(b->H->vheap);
-               }
-               if (b->T->hash) {
-                       HASHins(bm, p, t);
-
-                       if (tsize && tsize != b->T->vheap->size)
-                               HEAPwarm(b->T->vheap);
-               }
        }
        IMPSdestroy(b); /* no support for inserts in imprints yet */
        return b;
@@ -1369,16 +1327,13 @@ BUNappend(BAT *b, const void *t, bit for
  * one now must do:
  *     BATloopDEL(b,p) p = BUNdelete(b,p,FALSE)
  */
-#define hashins(h,i,v,n) HASHins_any(h,i,v)
-#define hashdel(h,i,v,n) HASHdel(h,i,v,n)
-
 static inline BUN
 BUNdelete_(BAT *b, BUN p, bit force)
 {
        BATiter bi = bat_iterator(b);
        BAT *bm = BBP_cache(-b->batCacheid);
        BUN l, last = BUNlast(b) - 1;
-       BUN idx1, idx2;
+       BUN idx1;
 
        ALIGNdel(b, "BUNdelete", force);        /* zap alignment info */
 
@@ -1386,12 +1341,10 @@ BUNdelete_(BAT *b, BUN p, bit force)
         * @- Committed Delete.
         * Deleting a (committed) bun: the first and deleted swap position.
         */
+       HASHdestroy(b);
        if (p < b->batInserted && !force) {
                idx1 = p;
                if (p == b->batFirst) { /* first can simply be discarded */
-                       hacc_update(hashdel,BUNhead,p,idx1);
-                       tacc_update(hashdel,BUNtail,p,idx1);
-
                        if (BAThdense(b)) {
                                bm->tseqbase = ++b->hseqbase;
                        }
@@ -1401,12 +1354,8 @@ BUNdelete_(BAT *b, BUN p, bit force)
                } else {
                        unsigned short hs = Hsize(b), ts = Tsize(b);
 
-                       hacc_update(hashdel,BUNhead,p,idx1);
-                       tacc_update(hashdel,BUNtail,p,idx1);
-
                        l = BUNfirst(b);
-                       idx2 = l;
-                       acc_move(l,p,idx2,idx1);
+                       acc_move(l,p);
                        if (b->hsorted) {
                                b->hsorted = FALSE;
                                b->H->nosorted = idx1;
@@ -1460,15 +1409,12 @@ BUNdelete_(BAT *b, BUN p, bit force)
                        (*tatmdel) (b->T->vheap, (var_t *) BUNtloc(bi, p));
                }
                idx1 = p;
-               hacc_update(hashdel,BUNhead,p,idx1);
-               tacc_update(hashdel,BUNtail,p,idx1);
-               idx2 = last;
                if (p != last) {
                        unsigned short hs = Hsize(b), ts = Tsize(b);
                        BATiter bi2 = bat_iterator(b);
 
                        /* coverity[result_independent_of_operands] */
-                       acc_move(last,p,idx2,idx1);
+                       acc_move(last,p);
                        /* If a column was sorted before the BUN was
                           deleted, check whether it is still sorted
                           afterward.  This is done by comparing the
@@ -1608,7 +1554,6 @@ BUNinplace(BAT *b, BUN p, const void *h,
                BAT *bm = BBP_cache(-b->batCacheid);
                BUN pit = p;
                BATiter bi = bat_iterator(b);
-               size_t tsize = b->tvarsized ? b->T->vheap->size : 0;
                int tt;
                BUN prv, nxt;
 
@@ -1621,9 +1566,8 @@ BUNinplace(BAT *b, BUN p, const void *h,
                         * property, so we must clear it */
                        b->T->nil = 0;
                }
-               tacc_update(hashdel,BUNtail,p,pit);
+               HASHremove(BATmirror(b));
                Treplacevalue(b, BUNtloc(bi, p), t);
-               tacc_update(hashins,BUNtail,p,pit);
 
                tt = b->ttype;
                prv = p > b->batFirst ? p - 1 : BUN_NONE;
@@ -1658,8 +1602,6 @@ BUNinplace(BAT *b, BUN p, const void *h,
                                b->T->norevsorted = pit;
                        }
                }
-               if (b->tvarsized && b->T->hash && tsize != b->T->vheap->size)
-                       HEAPwarm(b->T->vheap);
                if (((b->ttype != TYPE_void) & b->tkey & !(b->tkey & 
BOUND2BTRUE)) && b->batCount > 1) {
                        BATkey(bm, FALSE);
                }
diff --git a/gdk/gdk_delta.c b/gdk/gdk_delta.c
--- a/gdk/gdk_delta.c
+++ b/gdk/gdk_delta.c
@@ -157,22 +157,17 @@ BATundo(BAT *b)
                void (*tatmdel) (Heap *, var_t *) = BATatoms[b->ttype].atomDel;
 
                if (hunfix || tunfix || hatmdel || tatmdel || b->H->hash || 
b->T->hash) {
+                       HASHdestroy(b);
                        for (p = bunfirst; p <= bunlast; p++, i++) {
                                ptr h = BUNhead(bi, p);
                                ptr t = BUNtail(bi, p);
 
-                               if (b->H->hash) {
-                                       HASHdel(b->H->hash, i, h, p < bunlast);
-                               }
                                if (hunfix) {
                                        (*hunfix) (h);
                                }
                                if (hatmdel) {
                                        (*hatmdel) (b->H->vheap, (var_t *) 
BUNhloc(bi, p));
                                }
-                               if (b->T->hash) {
-                                       HASHdel(b->T->hash, i, t, p < bunlast);
-                               }
                                if (tunfix) {
                                        (*tunfix) (t);
                                }
diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c
--- a/gdk/gdk_group.c
+++ b/gdk/gdk_group.c
@@ -198,24 +198,40 @@
        )
 
 
+/* If a hash table exists on b we use it.
+ *
+ * The algorithm is simple.  We go through b and for each value we
+ * follow the hash chain starting at the next element after that value
+ * to find one that is equal to the value we're currently looking at.
+ * If we found such a value (including the preexisting group if we're
+ * refining), we add the value to the same group.  If we reach the end
+ * of the chain, we create a new group.
+ *
+ * If b (the original, that is) is a view on another BAT, and this
+ * other BAT has a hash, we use that.  The lo and hi values are the
+ * bounds of the parent BAT that we're considering.
+ *
+ * Note this algorithm depends critically on the fact that our hash
+ * chains go from higher to lower BUNs.
+ */
 #define GRP_use_existing_hash_table(INIT_0,INIT_1,HASH,COMP)           \
        do {                                                            \
                INIT_0;                                                 \
-               for (r = BUNfirst(b), p = r, q = r + BATcount(b);       \
+               for (r = lo, p = r, q = hi;                             \
                     p < q;                                             \
                     p++) {                                             \
                        INIT_1;                                         \
-                       /* this loop is similar, but not equal, to      \
-                        * HASHloop: the difference is that we only     \
-                        * consider BUNs smaller than the one we're     \
-                        * looking up (p), and that we also consider    \
-                        * the input groups */                          \
+                       /* this loop is similar, but not equal, to */   \
+                       /* HASHloop: the difference is that we only */  \
+                       /* consider BUNs smaller than the one we're */  \
+                       /* looking up (p), and that we also consider */ \
+                       /* the input groups */                          \
                        if (grps) {                                     \
-                               for (hb = HASHget(hs, HASH);            \
-                                    hb != HASHnil(hs);                 \
+                               for (hb = HASHgetlink(hs, p);           \
+                                    hb != HASHnil(hs) && hb >= lo;     \
                                     hb = HASHgetlink(hs, hb)) {        \
-                                       if (hb < p &&                   \
-                                           grps[hb - r] == grps[p - r] && \
+                                       assert(hb < p);                 \
+                                       if (grps[hb - r] == grps[p - r] && \
                                            COMP) {                     \
                                                oid grp = ngrps[hb - r]; \
                                                ngrps[p - r] = grp;     \
@@ -228,11 +244,11 @@
                                        }                               \
                                }                                       \
                        } else {                                        \
-                               for (hb = HASHget(hs, HASH);            \
-                                    hb != HASHnil(hs);                 \
+                               for (hb = HASHgetlink(hs, p);           \
+                                    hb != HASHnil(hs) && hb >= lo;     \
                                     hb = HASHgetlink(hs, hb)) {        \
-                                       if (hb < p &&                   \
-                                           COMP) {                     \
+                                       assert(hb < p);                 \
+                                       if (COMP) {                     \
                                                oid grp = ngrps[hb - r]; \
                                                ngrps[p - r] = grp;     \
                                                if (histo)              \
@@ -375,6 +391,7 @@ BATgroup_internal(BAT **groups, BAT **ex
        Hash *hs = NULL;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to