Changeset: db32af035ef1 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/db32af035ef1
Modified Files:
        gdk/gdk_join.c
Branch: ustr
Log Message:

Improve join of str and ustr bats.


diffs (236 lines):

diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -3193,15 +3193,17 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
        const char *lvals;
        const char *lvars;
        var_t off;
-       const void *nil = ATOMnilptr(l->ttype);
        int (*cmp)(const void *, const void *) = ATOMcompare(l->ttype);
        bool (*eq)(const void *, const void *) = ATOMequal(l->ttype);
        oid lval = oid_nil;     /* hold value if l is dense */
-       const char *v = (const char *) &lval;
+       const void *v = &lval;
        bool lskipped = false;  /* whether we skipped values in l */
        Hash *restrict hsh = NULL;
        bool locked = false;
+       bool ulocked = false;
        BUN maxsize;
+       BAT *ustr = NULL;
+       BATiter ustri;
        BAT *r1 = NULL;
        BAT *r2 = NULL;
        BAT *r3 = NULL;
@@ -3232,7 +3234,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
 
        rl = rci->seq - r->hseqbase;
        rh = canditer_last(rci) + 1 - r->hseqbase;
-       if (hash_cand || r->ustr) {
+       if (hash_cand) {
                /* we need to create a hash on r specific for the
                 * candidate list */
                char ext[32];
@@ -3248,6 +3250,22 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                if ((hsh = BAThash_impl(r, rci, false, ext)) == NULL) {
                        goto bailout;
                }
+       } else if (r->ustr) {
+               /* we use a hash on r */
+               MT_thread_setalgorithm(swapped ? "hashjoin using ustr hash 
(swapped)" : "hashjoin using ustr hash");
+               TRC_DEBUG(ALGO, ALGOBATFMT ": ustr hash%s\n",
+                         ALGOBATPAR(r),
+                         swapped ? " (swapped)" : "");
+               if (BAThash(r) != GDK_SUCCEED)
+                       goto bailout;
+               MT_rwlock_rdlock(&r->thashlock);
+               hsh = r->thash;
+               locked = true;
+               ustr = getUstrBat();
+               ustri = bat_iterator(ustr);
+               if (BAThash(ustr) != GDK_SUCCEED)
+                       goto bailout;
+               eq = ATOMequal(TYPE_oid);
        } else if (phash) {
                /* there is a hash on the parent which we should use */
                MT_thread_setalgorithm(swapped ? "hashjoin using parent hash 
(swapped)" : "hashjoin using parent hash");
@@ -3292,6 +3310,8 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                hsh = r->thash;
                locked = true;
        }
+       const void *nil = ustr ? &(var_t){0} : ATOMnilptr(l->ttype);
+
        if (locked && hsh == NULL) {
                GDKerror("Hash disappeared for "ALGOBATFMT"\n", ALGOBATPAR(r));
                goto bailout;
@@ -3308,6 +3328,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                 * set, or use a NIL mark for non-matches if r3p is
                 * set */
                if (hash_cand) {
+                       assert(ustr == NULL);
                        for (rb = HASHget(hsh, HASHprobe(hsh, nil));
                             rb != BUN_NONE;
                             rb = HASHgetlink(hsh, rb)) {
@@ -3329,6 +3350,28 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                                       __func__, t0);
                                }
                        }
+               } else if (ustr) {
+                       assert(locked);
+                       for (rb = HASHget(hsh, HASHprobe(hsh, &(var_t){0}));
+                            rb != BUN_NONE;
+                            rb = HASHgetlink(hsh, rb)) {
+                               if (rb >= rl && rb < rh &&
+                                   VarHeapVal(ri.base, rb, ri.width) == 0) {
+                                       if (r3p) {
+                                               defmark = bit_nil;
+                                               break;
+                                       }
+                                       MT_rwlock_rdunlock(&r->thashlock);
+                                       bat_iterator_end(&li);
+                                       bat_iterator_end(&ri);
+                                       BBPreclaim(b);
+                                       if (ustr)
+                                               bat_iterator_end(&ustri);
+                                       return nomatch(r1p, r2p, r3p, l, r, lci,
+                                                      bit_nil, false, false,
+                                                      __func__, t0);
+                               }
+                       }
                } else if (!BATtdensebi(&ri)) {
                        for (rb = HASHget(hsh, HASHprobe(hsh, nil));
                             rb != BUN_NONE;
@@ -3342,14 +3385,11 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                        }
                                        if (locked)
                                                
MT_rwlock_rdunlock(&r->thashlock);
-                                       if (r->ustr) {
-                                               HEAPfree(&hsh->heaplink, true);
-                                               HEAPfree(&hsh->heapbckt, true);
-                                               GDKfree(hsh);
-                                       }
                                        bat_iterator_end(&li);
                                        bat_iterator_end(&ri);
                                        BBPreclaim(b);
+                                       if (ustr)
+                                               bat_iterator_end(&ustri);
                                        return nomatch(r1p, r2p, r3p, l, r, lci,
                                                       bit_nil, false, false,
                                                       __func__, t0);
@@ -3407,6 +3447,10 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                HASHJOIN(uuid);
                break;
        default:
+               if (ustr) {
+                       MT_rwlock_rdlock(&ustr->thashlock);
+                       ulocked = true;
+               }
                while (lci->next < lci->ncand) {
                        GDK_CHECK_TIMEOUT(qry_ctx, counter,
                                        GOTO_LABEL_TIMEOUT_HANDLER(bailout, 
qry_ctx));
@@ -3415,9 +3459,25 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                lval = lo - l->hseqbase + l->tseqbase;
                        else if (li.type != TYPE_void)
                                v = VALUE(l, lo - l->hseqbase);
+                       if (ustr) {
+                               rb = BUN_NONE;
+                               HASHloop_str(&ustri, ustr->thash, rb, v)
+                                       break;
+                               if (rb == BUN_NONE)
+                                       v = NULL;
+                               else {
+                                       off = VarHeapVal(ustri.base, rb, 
ustri.width);
+                                       v = &off;
+                               }
+                       }
                        nr = 0;
                        bit mark = defmark;
-                       if ((!nil_matches || not_in) && eq(v, nil)) {
+                       if (v == NULL) {
+                               /* value did not occur in ustr bat
+                                * (value is also not nil since nil has
+                                * offset 0 and does occur) */
+                               ;
+                       } else if ((!nil_matches || not_in) && eq(v, nil)) {
                                /* no match */
                                if (not_in) {
                                        lskipped = BATcount(r1) > 0;
@@ -3425,6 +3485,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                }
                                mark = bit_nil;
                        } else if (hash_cand) {
+                               assert(ustr == NULL);
                                for (rb = HASHget(hsh, HASHprobe(hsh, v));
                                     rb != BUN_NONE;
                                     rb = HASHgetlink(hsh, rb)) {
@@ -3459,7 +3520,9 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                     rb != BUN_NONE;
                                     rb = HASHgetlink(hsh, rb)) {
                                        if (rb >= rl && rb < rh &&
-                                           (*eq)(v, BUNtail(&ri, rb)) &&
+                                           (ustr ?
+                                            off == VarHeapVal(ri.base, rb, 
ri.width)
+                                            : (*eq)(v, BUNtail(&ri, rb))) &&
                                            canditer_contains(rci, ro = (oid) 
(rb - roff + rseq))) {
                                                if (only_misses) {
                                                        nr++;
@@ -3475,7 +3538,9 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                                     rb != BUN_NONE;
                                     rb = HASHgetlink(hsh, rb)) {
                                        if (rb >= rl && rb < rh &&
-                                           (*eq)(v, BUNtail(&ri, rb))) {
+                                           (ustr ?
+                                            off == VarHeapVal(ri.base, rb, 
ri.width)
+                                            : (*eq)(v, BUNtail(&ri, rb)))) {
                                                if (only_misses) {
                                                        nr++;
                                                        break;
@@ -3534,6 +3599,10 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                        if (nr > 0 && BATcount(r1) > nr)
                                r1->trevsorted = false;
                }
+               if (ustr) {
+                       MT_rwlock_rdunlock(&ustr->thashlock);
+                       ulocked = false;
+               }
                break;
        }
        if (locked) {
@@ -3541,7 +3610,7 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
                MT_rwlock_rdunlock(&r->thashlock);
        }
 
-       if (hash_cand || r->ustr) {
+       if (hash_cand) {
                HEAPfree(&hsh->heaplink, true);
                HEAPfree(&hsh->heapbckt, true);
                GDKfree(hsh);
@@ -3571,6 +3640,8 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
        }
        bat_iterator_end(&li);
        bat_iterator_end(&ri);
+       if (ustr)
+               bat_iterator_end(&ustri);
        if (BATcount(r1) > 0) {
                if (BATtdense(r1))
                        r1->tseqbase = ((oid *) r1->theap->base)[0];
@@ -3604,15 +3675,19 @@ hashjoin(BAT **r1p, BAT **r2p, BAT **r3p
        return GDK_SUCCEED;
 
   bailout:
+       if (ulocked)
+               MT_rwlock_rdunlock(&ustr->thashlock);
        if (locked)
                MT_rwlock_rdunlock(&r->thashlock);
-       if ((hash_cand || r->ustr) && hsh) {
+       if (hash_cand && hsh) {
                HEAPfree(&hsh->heaplink, true);
                HEAPfree(&hsh->heapbckt, true);
                GDKfree(hsh);
        }
        bat_iterator_end(&li);
        bat_iterator_end(&ri);
+       if (ustr)
+               bat_iterator_end(&ustri);
        BBPreclaim(r1);
        BBPreclaim(r2);
        BBPreclaim(r3);
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to