Changeset: e13766e9dba6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e13766e9dba6
Modified Files:
        gdk/gdk_hash.c
        gdk/gdk_join.c
        gdk/gdk_private.h
Branch: ustr
Log Message:

Implemented join using offsets only if vheap is duplicate-free.


diffs (truncated from 468 to 300 lines):

diff --git a/gdk/gdk_hash.c b/gdk/gdk_hash.c
--- a/gdk/gdk_hash.c
+++ b/gdk/gdk_hash.c
@@ -721,9 +721,12 @@ BAThashsave(BAT *b, bool dosync)
  * If a candidate list s is also given, the hash table is specific for
  * the combination of the two: only values from b that are referred to
  * by s are included in the hash table, so if a result is found when
- * searching the hash table, the result is a candidate. */
+ * searching the hash table, the result is a candidate.
+ * If offsets is set, the hash is built on the theap values and not the
+ * values in the tvheap that they point to. */
 Hash *
-BAThash_impl(BAT *restrict b, struct canditer *restrict ci, const char 
*restrict ext)
+BAThash_impl(BAT *restrict b, struct canditer *restrict ci,
+            bool offsets, const char *restrict ext)
 {
        lng t0 = 0;
        BUN cnt1;
@@ -735,6 +738,27 @@ BAThash_impl(BAT *restrict b, struct can
        const char *nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
        BATiter bi = bat_iterator(b);
        unsigned int tpe = ATOMbasetype(bi.type);
+       if (offsets) {
+               assert(b->tvheap);
+               switch (bi.width) {
+               case 1:
+                       tpe = TYPE_bte;
+                       break;
+               case 2:
+                       tpe = TYPE_sht;
+                       break;
+               case 4:
+                       tpe = TYPE_int;
+                       break;
+#if SIZEOF_VAR_T == 8
+               case 8:
+                       tpe = TYPE_lng;
+                       break;
+#endif
+               default:
+                       MT_UNREACHABLE();
+               }
+       }
        bool hascand = ci->tpe != cand_dense || ci->ncand != bi.count;
 
        QryCtx *qry_ctx = MT_thread_get_qry_ctx();
@@ -1046,7 +1070,7 @@ BAThash(BAT *b)
        if (b->thash == NULL) {
                struct canditer ci;
                canditer_init(&ci, b, NULL);
-               if ((b->thash = BAThash_impl(b, &ci, "thash")) == NULL) {
+               if ((b->thash = BAThash_impl(b, &ci, false, "thash")) == NULL) {
                        MT_rwlock_wrunlock(&b->thashlock);
                        return GDK_FAIL;
                }
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -2773,6 +2773,295 @@ mergejoin(BAT **r1p, BAT **r2p, BAT **r3
                nr++;                                                   \
        } while (false)
 
+static gdk_return
+vkeyjoin(BAT **r1p, BAT **r2p, BAT **r3p, BAT *l, BAT *r,
+        struct canditer *restrict lci, struct canditer *restrict rci,
+        bool nil_matches, bool nil_on_miss, bool semi, bool only_misses,
+        bool not_in, bool max_one, bool min_one,
+        BUN estimate, lng t0, bool swapped,
+        const char *reason)
+{
+       BAT *r1 = NULL;
+       BAT *r2 = NULL;
+       BAT *r3 = NULL;
+
+       assert(l->tvheap);
+       assert(r->tvheap);
+       assert(l->tvheap->parentid == r->tvheap->parentid);
+       assert(BBP_desc(VIEWvtparent(l))->tvkey);
+       assert(BBP_desc(VIEWvtparent(r))->tvkey);
+
+       size_t counter = 0;
+       QryCtx *qry_ctx = MT_thread_get_qry_ctx();
+
+       Hash *hsh;
+       char ext[32];
+       if (snprintf(ext, sizeof(ext), "thshjn%x",
+                    (unsigned) MT_getpid()) >= (int) sizeof(ext) ||
+           (hsh = BAThash_impl(r, rci, true, ext)) == NULL)
+               return GDK_FAIL;
+
+       BATiter li = bat_iterator(l);
+       BATiter ri = bat_iterator(r);
+
+       var_t niloff = GDK_ELIMDOUBLES(li.vh) ? strLocate(li.vh, str_nil) : 0;
+       assert(niloff != (var_t) -2);
+
+       bit defmark = 0;
+       if ((not_in || r3p) && !ri.nonil) {
+               BUN rb;
+               union {
+                       bte b;
+                       sht s;
+                       int i;
+                       lng l;
+               } u;
+               const void *nil;
+               switch (ri.width) {
+               case 1:
+                       u.b = niloff == 0 ? 0 : niloff - GDK_VAROFFSET;
+                       nil = &u.b;
+                       break;
+               case 2:
+                       u.s = niloff == 0 ? 0 : niloff - GDK_VAROFFSET;
+                       nil = &u.s;
+                       break;
+               case 4:
+                       u.i = niloff;
+                       nil = &u.i;
+                       break;
+               case 8:
+                       u.l = niloff;
+                       nil = &u.l;
+                       break;
+               }
+               HASHlooploc(&ri, hsh, rb, nil) {
+                       if (r3p) {
+                               defmark = bit_nil;
+                               break;
+                       }
+                       HEAPfree(&hsh->heaplink, true);
+                       HEAPfree(&hsh->heapbckt, true);
+                       GDKfree(hsh);
+                       bat_iterator_end(&li);
+                       bat_iterator_end(&ri);
+                       return nomatch(r1p, r2p, r3p, l, r, lci,
+                                      bit_nil, false, false,
+                                      __func__, t0);
+               }
+       }
+
+       BUN maxsize = joininitresults(r1p, r2p, r3p, lci->ncand, rci->ncand,
+                                     li.key, ri.key, semi | max_one,
+                                     nil_on_miss, only_misses, min_one,
+                                     estimate);
+       if (maxsize == BUN_NONE) {
+               goto bailout;
+       }
+
+       /* from here on, semi is used to bail out early from the
+        * collision lists; if right is key, it's effectively a
+        * semi-join, and max_one is automatically satisfied; otherwise,
+        * we need to continue looking if max_one is specified to make
+        * sure there is only one match */
+       if (r->tkey)
+               semi = true;
+       else if (max_one)
+               semi = false;
+
+       r1 = *r1p;
+       r2 = r2p ? *r2p : NULL;
+       r3 = r3p ? *r3p : NULL;
+
+       /* basic properties will be adjusted if necessary later on,
+        * they were initially set by joininitresults() */
+
+       if (r2) {
+               r2->tkey = li.key;
+               /* r2 is not likely to be sorted (although it is
+                * certainly possible) */
+               r2->tsorted = false;
+               r2->trevsorted = false;
+               r2->tseqbase = oid_nil;
+       }
+
+       if (lci->tpe != cand_dense)
+               r1->tseqbase = oid_nil;
+
+       bool lskipped = false;
+       while (lci->next < lci->ncand) {
+               GDK_CHECK_TIMEOUT(qry_ctx, counter, 
GOTO_LABEL_TIMEOUT_HANDLER(bailout, qry_ctx));
+               oid lo = canditer_next(lci);
+               var_t v = VarHeapVal(li.base, lo - l->hseqbase, li.width);
+               BUN nr = 0;
+               bit mark = defmark;
+               if ((!nil_matches || not_in) && v == niloff) {
+                       /* no match */
+                       if (not_in) {
+                               lskipped = BATcount(r1) > 0;
+                               continue;
+                       }
+                       mark = bit_nil;
+               } else {
+                       BUN x;
+                       switch (ri.width) {
+                       case 1: {
+                               uint8_t b = v == 0 ? 0 : (uint8_t) (v - 
GDK_VAROFFSET);
+                               x = hash_bte(hsh, &b);
+                               break;
+                       }
+                       case 2: {
+                               uint16_t s = v == 0 ? 0 : (uint16_t) (v - 
GDK_VAROFFSET);
+                               x = hash_sht(hsh, &s);
+                               break;
+                       }
+                       case 4: {
+                               uint32_t i = (uint32_t) v;
+                               x = hash_int(hsh, &i);
+                               break;
+                       }
+                       case 8: {
+                               uint64_t l = (uint64_t) v;
+                               x = hash_lng(hsh, &l);
+                               break;
+                       }
+                       }
+                       for (BUN rb = HASHget(hsh, x);
+                            rb != BUN_NONE;
+                            rb = HASHgetlink(hsh, rb)) {
+                               oid ro = canditer_idx(rci, rb);
+                               if (v != VarHeapVal(ri.base, ro - r->hseqbase, 
ri.width))
+                                       continue;
+                               if (only_misses) {
+                                       nr++;
+                                       break;
+                               }
+                               HASHLOOPBODY();
+                               if (semi)
+                                       break;
+                       }
+               }
+               if (nr == 0) {
+                       if (only_misses) {
+                               nr = 1;
+                               if (maybeextend(r1, r2, r3, 1, lci->next, 
lci->ncand, maxsize) != GDK_SUCCEED)
+                                       goto bailout;
+                               APPEND(r1, lo);
+                               if (lskipped)
+                                       r1->tseqbase = oid_nil;
+                       } else if (nil_on_miss) {
+                               nr = 1;
+                               if (maybeextend(r1, r2, r3, 1, lci->next, 
lci->ncand, maxsize) != GDK_SUCCEED)
+                                       goto bailout;
+                               APPEND(r1, lo);
+                               if (r2) {
+                                       r2->tnil = true;
+                                       r2->tnonil = false;
+                                       r2->tkey = false;
+                                       APPEND(r2, oid_nil);
+                               }
+                               if (r3) {
+                                       r3->tnil |= mark == bit_nil;
+                                       ((bit *) 
r3->theap->base)[r3->batCount++] = mark;
+                               }
+                       } else if (min_one) {
+                               GDKerror("not enough matches");
+                               goto bailout;
+                       } else {
+                               lskipped = BATcount(r1) > 0;
+                       }
+               } else if (only_misses) {
+                       lskipped = BATcount(r1) > 0;
+               } else {
+                       if (lskipped) {
+                               /* note, we only get here in an
+                                * iteration *after* lskipped was
+                                * first set to true, i.e. we did
+                                * indeed skip values in l */
+                               r1->tseqbase = oid_nil;
+                       }
+                       if (nr > 1) {
+                               r1->tkey = false;
+                               r1->tseqbase = oid_nil;
+                       }
+               }
+               if (nr > 0 && BATcount(r1) > nr)
+                       r1->trevsorted = false;
+       }
+
+       HEAPfree(&hsh->heaplink, true);
+       HEAPfree(&hsh->heapbckt, true);
+       GDKfree(hsh);
+
+       /* also set other bits of heap to correct value to indicate size */
+       BATsetcount(r1, BATcount(r1));
+       if (BATcount(r1) <= 1) {
+               r1->tsorted = true;
+               r1->trevsorted = true;
+               r1->tkey = true;
+               r1->tseqbase = 0;
+       }
+       if (r2) {
+               BATsetcount(r2, BATcount(r2));
+               assert(BATcount(r1) == BATcount(r2));
+               if (BATcount(r2) <= 1) {
+                       r2->tsorted = true;
+                       r2->trevsorted = true;
+                       r2->tkey = true;
+                       r2->tseqbase = 0;
+               }
+       }
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to