Changeset: f9e92f9706cb for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f9e92f9706cb Modified Files: clients/Tests/exports.stable.out gdk/ChangeLog gdk/gdk.h gdk/gdk_bat.c gdk/gdk_join.c gdk/gdk_logger.c gdk/gdk_setop.c sql/backends/monet5/sql.c sql/storage/bat/bat_table.c sql/test/leaks/Tests/check1.stable.out sql/test/leaks/Tests/check1.stable.out.int128 sql/test/leaks/Tests/check2.stable.out sql/test/leaks/Tests/check2.stable.out.int128 sql/test/leaks/Tests/check3.stable.out sql/test/leaks/Tests/check3.stable.out.int128 sql/test/leaks/Tests/check4.stable.out sql/test/leaks/Tests/check4.stable.out.int128 sql/test/leaks/Tests/check5.stable.out sql/test/leaks/Tests/check5.stable.out.int128 sql/test/leaks/Tests/drop3.stable.out sql/test/leaks/Tests/drop3.stable.out.int128 sql/test/leaks/Tests/select1.stable.out sql/test/leaks/Tests/select1.stable.out.int128 sql/test/leaks/Tests/select2.stable.out sql/test/leaks/Tests/select2.stable.out.int128 sql/test/leaks/Tests/temp1.stable.out sql/test/leaks/Tests/temp1.stable.out.int128 sql/test/leaks/Tests/temp2.stable.out sql/test/leaks/Tests/temp2.stable.out.int128 sql/test/leaks/Tests/temp3.stable.out sql/test/leaks/Tests/temp3.stable.out.int128 Branch: default Log Message:
Implemented BATsubdiff, replacing BATkdiff. diffs (truncated from 2621 to 300 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -188,6 +188,7 @@ BAT *BATssort(BAT *b); BAT *BATssort_rev(BAT *b); gdk_return BATsubbandjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, const void *c1, const void *c2, int li, int hi, BUN estimate); gdk_return BATsubcross(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr); +BAT *BATsubdiff(BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_return BATsubleftjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); diff --git a/gdk/ChangeLog b/gdk/ChangeLog --- a/gdk/ChangeLog +++ b/gdk/ChangeLog @@ -1,6 +1,12 @@ # ChangeLog file for MonetDB # This file is updated with Maddlog +* Sat Sep 5 2015 Sjoerd Mullender <[email protected]> +- Implemented BATsubdiff which returns a list of OIDs (sorted, i.e. usable + as candidate list) of tuples in the left input whose value does not + occur in the right input. Reimplemented BATkdiff (to be removed later) + using this new function. + * Fri Sep 4 2015 Sjoerd Mullender <[email protected]> - Removed function BATkintersect. It wasn't used anymore. It's functionality can be obtained by using BATsubsemijoin. diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -3176,6 +3176,7 @@ gdk_export gdk_return BATsubleftjoin(BAT gdk_export gdk_return BATsubouterjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_export gdk_return BATsubthetajoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int op, int nil_matches, BUN estimate); gdk_export gdk_return BATsubsemijoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); +gdk_export BAT *BATsubdiff(BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_export gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_export gdk_return BATsubleftfetchjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate); gdk_export gdk_return BATsubbandjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, const void *c1, const void *c2, int li, int hi, BUN estimate); diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -2101,6 +2101,34 @@ BATsetcount(BAT *b, BUN cnt) b->hsorted = b->hrevsorted = ATOMlinear(b->htype) != 0; b->tsorted = b->trevsorted = ATOMlinear(b->ttype) != 0; } + if (b->htype == TYPE_void) { + b->hsorted = 1; + if (b->hseqbase == oid_nil) { /* unlikely */ + b->hkey = cnt <= 1; + b->hrevsorted = 1; + b->H->nil = 1; + b->H->nonil = 0; + } else { + b->hkey = 1; + b->hrevsorted = cnt <= 1; + b->H->nil = 0; + b->H->nonil = 1; + } + } + if (b->ttype == TYPE_void) { + b->tsorted = 1; + if (b->tseqbase == oid_nil) { + b->tkey = cnt <= 1; + b->trevsorted = 1; + b->T->nil = 1; + b->T->nonil = 0; + } else { + b->tkey = 1; + b->trevsorted = cnt <= 1; + b->T->nil = 0; + b->T->nonil = 1; + } + } assert(b->batCapacity >= cnt); } diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -65,6 +65,17 @@ * values match if the left value is between two corresponding * right values; two extra Boolean parameters, li and hi, * indicate whether equal values match + * + * In addition to these functions, there are two more functions that + * are closely related: + * BATsubdiff + * difference: return a candidate list compatible list of OIDs of + * tuples in the left input whose value does not occur in the + * right input + * BATproject + * projection: return a BAT aligned with the left input whose + * values are the values from the right input that were referred + * to by the OIDs in the tail of the left input */ /* Perform a bunch of sanity checks on the inputs to a join. */ @@ -107,9 +118,12 @@ joinparamcheck(BAT *l, BAT *r1, BAT *r2, return GDK_SUCCEED; } -/* Create the result bats for a join. */ +/* Create the result bats for a join, returns the absolute maximum + * number of outputs that could possibly be generated. */ static BUN -joininitresults(BAT **r1p, BAT **r2p, BUN lcnt, BUN rcnt, int lkey, int rkey, int semi, int nil_on_miss, BUN estimate, const char *func) +joininitresults(BAT **r1p, BAT **r2p, BUN lcnt, BUN rcnt, int lkey, int rkey, + int semi, int nil_on_miss, int only_misses, BUN estimate, + const char *func) { BAT *r1, *r2; BUN maxsize, size; @@ -117,7 +131,10 @@ joininitresults(BAT **r1p, BAT **r2p, BU lkey |= lcnt <= 1; rkey |= rcnt <= 1; - if (rkey | semi) { + *r1p = NULL; + if (r2p) + *r2p = NULL; + if (rkey | semi | only_misses) { /* each entry left matches at most one on right, in * case nil_on_miss is also set, each entry matches * exactly one */ @@ -143,32 +160,41 @@ joininitresults(BAT **r1p, BAT **r2p, BU size = estimate == BUN_NONE ? lcnt : estimate; if (size > maxsize) size = maxsize; + if ((rkey | semi | only_misses) & nil_on_miss) { + /* see comment above: each entry left matches exactly + * once */ + size = maxsize; + } r1 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT); - r2 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT); - if (r1 == NULL || r2 == NULL) { - BBPreclaim(r1); - BBPreclaim(r2); - *r1p = *r2p = NULL; + if (r1 == NULL) { GDKerror("%s: cannot create output BATs.\n", func); return BUN_NONE; } BATseqbase(r1, 0); - BATseqbase(r2, 0); r1->T->nil = 0; r1->T->nonil = 1; r1->tkey = 1; r1->tsorted = 1; r1->trevsorted = 1; r1->tdense = 1; - r2->T->nil = 0; - r2->T->nonil = 1; - r2->tkey = 1; - r2->tsorted = 1; - r2->trevsorted = 1; - r2->tdense = 1; *r1p = r1; - *r2p = r2; + if (r2p) { + r2 = BATnew(TYPE_void, TYPE_oid, size, TRANSIENT); + if (r2 == NULL) { + BBPreclaim(r1); + GDKerror("%s: cannot create output BATs.\n", func); + return BUN_NONE; + } + BATseqbase(r2, 0); + r2->T->nil = 0; + r2->T->nonil = 1; + r2->tkey = 1; + r2->tsorted = 1; + r2->trevsorted = 1; + r2->tdense = 1; + *r2p = r2; + } return maxsize; } @@ -435,11 +461,11 @@ binsearch(const oid *rcand, oid offset, static gdk_return nomatch(BAT *r1, BAT *r2, BAT *l, BAT *r, BUN lstart, BUN lend, const oid *lcand, const oid *lcandend, - int nil_on_miss, int must_match, const char *func) + int nil_on_miss, int only_misses, int must_match, const char *func) { BUN cnt; - if (lstart == lend || (!must_match && !nil_on_miss)) { + if (lstart == lend || (!must_match && !(nil_on_miss | only_misses))) { virtualize(r1); virtualize(r2); return GDK_SUCCEED; @@ -474,15 +500,19 @@ nomatch(BAT *r1, BAT *r2, BAT *l, BAT *r BATsetcount(r1, cnt); BATseqbase(BATmirror(r1), lstart + l->hseqbase); } - HEAPfree(&r2->T->heap, 1); - r2->ttype = TYPE_void; - r2->tvarsized = 1; - r2->T->width = 0; - r2->T->shift = 0; - r2->tdense = 0; - BATextend(r2, cnt); - BATsetcount(r2, cnt); - BATseqbase(BATmirror(r2), oid_nil); + BATseqbase(r1, 0); + if (r2) { + HEAPfree(&r2->T->heap, 1); + r2->ttype = TYPE_void; + r2->tvarsized = 1; + r2->T->width = 0; + r2->T->shift = 0; + r2->tdense = 0; + BATextend(r2, cnt); + BATsetcount(r2, cnt); + BATseqbase(BATmirror(r2), oid_nil); + BATseqbase(r2, 0); + } ALGODEBUG fprintf(stderr, "#%s(l=%s,r=%s)=(%s#"BUNFMT"%s%s,%s#"BUNFMT"%s%s) -- nomatch\n", func, @@ -490,15 +520,15 @@ nomatch(BAT *r1, BAT *r2, BAT *l, BAT *r BATgetId(r1), BATcount(r1), r1->tsorted ? "-sorted" : "", r1->trevsorted ? "-revsorted" : "", - BATgetId(r2), BATcount(r2), - r2->tsorted ? "-sorted" : "", - r2->trevsorted ? "-revsorted" : ""); + r2 ? BATgetId(r2) : "--", r2 ? BATcount(r2) : 0, + r2 && r2->tsorted ? "-sorted" : "", + r2 && r2->trevsorted ? "-revsorted" : ""); return GDK_SUCCEED; } static gdk_return mergejoin_void(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, - int nil_on_miss, int must_match) + int nil_on_miss, int only_misses, int must_match) { oid lo, hi; BUN cnt, i; @@ -575,10 +605,10 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l, seq - l->hseqbase, seq + cnt - l->hseqbase, NULL, NULL, nil_on_miss, - must_match, + only_misses, must_match, "mergejoin_void"); if (must_match && hi - lo < cnt) { - GDKerror("mergejoin(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); + GDKerror("mergejoin_void(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); goto bailout; } @@ -590,6 +620,48 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l, * tail since all values in l will match * something (even if nil if nil_on_miss is * set) */ + if (only_misses) { + /* the return values are + * [seq..lo') + [hi'..seq+cnt) + * where lo' and hi' are lo and hi + * translated back to l's head + * values */ + lo = lo + l->hseqbase - l->tseqbase; /* lo' */ + hi = hi + l->hseqbase - l->tseqbase; /* hi' */ + assert(lo >= seq); + assert(hi <= seq + cnt); + if (lo == seq || hi == seq + cnt) { + /* one of [seq..lo') and + * [hi'..seq+cnt) is empty, so + * the result is the other + * range and thus dense */ + HEAPfree(&r1->T->heap, 1); + r1->ttype = TYPE_void; + r1->tvarsized = 1; + r1->T->width = 0; + r1->T->shift = 0; + r1->tdense = 0; + BATextend(r1, cnt - (hi - lo)); + BATsetcount(r1, cnt - (hi - lo)); + BATseqbase(BATmirror(r1), lo == seq ? hi : seq); + } else { + if (BATextend(r1, cnt - (hi - lo)) != GDK_SUCCEED) + goto bailout; + for (o = seq; o < lo; o++) + APPEND(r1, o); + seq += cnt; + for (o = hi; o < seq; o++) + APPEND(r1, o); + BATsetcount(r1, cnt - (hi - lo)); + r1->tsorted = 1; + r1->trevsorted = 0; + r1->tdense = 0; + r1->tkey = 1; + r1->T->nil = 0; + r1->T->nonil = 1; + } + goto doreturn; _______________________________________________ checkin-list mailing list [email protected] https://www.monetdb.org/mailman/listinfo/checkin-list
