Changeset: 5ffd9e704fb0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5ffd9e704fb0
Modified Files:
gdk/gdk_join.c
gdk/gdk_private.h
gdk/gdk_search.c
monetdb5/optimizer/opt_costModel.c
monetdb5/optimizer/opt_evaluate.c
sql/backends/monet5/UDF/pyapi/pyapi.c
sql/backends/monet5/sql_execute.c
Branch: default
Log Message:
Merge with Jul2017 branch.
diffs (truncated from 1261 to 300 lines):
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -205,8 +205,8 @@ joininitresults(BAT **r1p, BAT **r2p, BU
#define VALUE(s, x) (s##vars ? \
s##vars + VarHeapVal(s##vals, (x), s##width) : \
- s##vals + ((x) * s##width))
-#define FVALUE(s, x) (s##vals + ((x) * s##width))
+ (const char *) s##vals + ((x) * s##width))
+#define FVALUE(s, x) ((const char *) s##vals + ((x) * s##width))
#define APPEND(b, o) (((oid *) b->theap.base)[b->batCount++] = (o))
@@ -762,21 +762,6 @@ mergejoin_void(BAT *r1, BAT *r2, BAT *l,
return GDK_FAIL;
}
-/* Perform a "merge" join on l and r (if both are sorted) with
- * optional candidate lists, or join using binary search on r if l is
- * not sorted. The return BATs have already been created by the
- * caller.
- *
- * If nil_matches is set, nil values are treated as ordinary values
- * that can match; otherwise nil values never match.
- *
- * If nil_on_miss is set, a nil value is returned in r2 if there is no
- * match in r for a particular value in l (left outer join).
- *
- * If semi is set, only a single set of values in r1/r2 is returned if
- * there is a match of l in r, no matter how many matches there are in
- * r; otherwise all matches are returned.
- */
static gdk_return
mergejoin_int(BAT *r1, BAT *r2, BAT *l, BAT *r,
int nil_matches, BUN maxsize, lng t0, int swapped)
@@ -1371,22 +1356,46 @@ mergejoin_lng(BAT *r1, BAT *r2, BAT *l,
return GDK_FAIL;
}
+/* Perform a "merge" join on l and r (if both are sorted) with
+ * optional candidate lists, or join using binary search on r if l is
+ * not sorted. The return BATs have already been created by the
+ * caller.
+ *
+ * If nil_matches is set, nil values are treated as ordinary values
+ * that can match; otherwise nil values never match.
+ *
+ * If nil_on_miss is set, a nil value is returned in r2 if there is no
+ * match in r for a particular value in l (left outer join).
+ *
+ * If semi is set, only a single set of values in r1/r2 is returned if
+ * there is a match of l in r, no matter how many matches there are in
+ * r; otherwise all matches are returned.
+ *
+ * maxsize is the absolute maximum size the output BATs can become (if
+ * they were to become larger, we have a bug).
+ *
+ * t0 and swapped are only for debugging (ALGOMASK set in GDKdebug).
+ */
static gdk_return
mergejoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr,
int nil_matches, int nil_on_miss, int semi, int only_misses,
BUN maxsize, lng t0, int swapped)
{
- BUN lstart, lend, lcnt;
+ BUN lstart, lend;
const oid *lcand, *lcandend;
- BUN rstart, rend, rcnt, rstartorig;
+ BUN rstart, rend, rstartorig;
+ BUN lcnt, rcnt; /* not actively used, but needed for CANDINIT */
const oid *rcand, *rcandend, *rcandorig;
+ /* [lr]scan determine how far we look ahead in l/r in order to
+ * decide whether we want to do a binary search or a scan */
BUN lscan, rscan;
- const char *lvals, *rvals;
- const char *lvars, *rvars;
- int lwidth, rwidth;
+ const void *lvals, *rvals; /* the values of l/r (NULL if dense) */
+ const char *lvars, *rvars; /* the indirect values (NULL if fixed size)
*/
+ int lwidth, rwidth; /* width of the values */
const void *nil = ATOMnilptr(l->ttype);
int (*cmp)(const void *, const void *) = ATOMcompare(l->ttype);
- const char *v, *prev = NULL;
+ const void *v; /* points to value under consideration */
+ const void *prev = NULL;
BUN nl, nr;
BUN total; /* number of rows in l we scan */
int insert_nil;
@@ -1400,10 +1409,10 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* l/r: it determines the comparison function used */
int lordering, rordering;
oid lv;
- BUN i;
+ BUN i, j; /* counters */
int lskipped = 0; /* whether we skipped values in l */
- lng loff = 0, roff = 0;
- oid lval = oid_nil, rval = oid_nil;
+ lng loff = 0, roff = 0; /* set if l/r is dense */
+ oid lval = oid_nil, rval = oid_nil; /* temporary space to point v to */
if (sl == NULL && sr == NULL && !nil_on_miss &&
!semi && !only_misses && l->tsorted && r->tsorted && r2 != NULL) {
@@ -1449,8 +1458,8 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
CANDINIT(l, sl, lstart, lend, lcnt, lcand, lcandend);
CANDINIT(r, sr, rstart, rend, rcnt, rcand, rcandend);
total = lcand ? (BUN) (lcandend - lcand) : lend - lstart;
- lvals = l->ttype == TYPE_void ? NULL : (const char *) Tloc(l, 0);
- rvals = r->ttype == TYPE_void ? NULL : (const char *) Tloc(r, 0);
+ lvals = BATtvoid(l) ? NULL : Tloc(l, 0);
+ rvals = BATtvoid(r) ? NULL : Tloc(r, 0);
if (l->tvarsized && l->ttype) {
assert(r->tvarsized && r->ttype);
lvars = l->tvheap->base;
@@ -1468,14 +1477,14 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
if (lstart == lend ||
rstart == rend ||
(!nil_matches &&
- ((l->ttype == TYPE_void && l->tseqbase == oid_nil) ||
- (r->ttype == TYPE_void && r->tseqbase == oid_nil))) ||
- (l->ttype == TYPE_void && l->tseqbase == oid_nil &&
+ ((BATtvoid(l) && l->tseqbase == oid_nil) ||
+ (BATtvoid(r) && r->tseqbase == oid_nil))) ||
+ (BATtvoid(l) && l->tseqbase == oid_nil &&
(r->tnonil ||
- (r->ttype == TYPE_void && r->tseqbase != oid_nil))) ||
- (r->ttype == TYPE_void && r->tseqbase == oid_nil &&
+ (BATtvoid(r) && r->tseqbase != oid_nil))) ||
+ (BATtvoid(r) && r->tseqbase == oid_nil &&
(l->tnonil ||
- (l->ttype == TYPE_void && l->tseqbase != oid_nil)))) {
+ (BATtvoid(l) && l->tseqbase != oid_nil)))) {
/* there are no matches */
return nomatch(r1, r2, l, r, lstart, lend, lcand, lcandend,
nil_on_miss, only_misses, "mergejoin", t0);
@@ -1490,13 +1499,13 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
nl >>= 1;
equal_order = (l->tsorted && r->tsorted) ||
(l->trevsorted && r->trevsorted &&
- l->ttype != TYPE_void && r->ttype != TYPE_void);
+ !BATtvoid(l) && !BATtvoid(r));
lordering = l->tsorted && (r->tsorted || !equal_order) ? 1 : -1;
rordering = equal_order ? lordering : -lordering;
} else {
/* if l not sorted, we will always use binary search
* on r */
- assert(l->ttype != TYPE_void); /* void is always sorted */
+ assert(!BATtvoid(l)); /* void is always sorted */
lscan = 0;
equal_order = 1;
lordering = 1;
@@ -1516,34 +1525,49 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
rscan++)
nl >>= 1;
- if (l->ttype == TYPE_void) {
+ if (BATtvoid(l)) {
+ assert(lvals == NULL);
if (lcand) {
lstart = 0;
lend = (BUN) (lcandend - lcand);
- lvals = (const char *) lcand;
- lcand = NULL;
- lwidth = SIZEOF_OID;
}
if (l->tseqbase == oid_nil)
loff = lng_nil;
else
loff = (lng) l->tseqbase - (lng) l->hseqbase;
}
- if (r->ttype == TYPE_void) {
+ if (BATtvoid(r)) {
+ assert(rvals == NULL);
if (rcand) {
rstart = 0;
rend = (BUN) (rcandend - rcand);
- rvals = (const char *) rcand;
- rcand = NULL;
- rwidth = SIZEOF_OID;
}
if (r->tseqbase == oid_nil)
roff = lng_nil;
else
roff = (lng) r->tseqbase - (lng) r->hseqbase;
}
- assert(lvals != NULL || lcand == NULL);
- assert(rvals != NULL || rcand == NULL);
+ assert(loff == 0 || lvals == NULL);
+ assert(roff == 0 || rvals == NULL);
+ /* At this point the various variables that help us through
+ * the algorithm have been set. The table explains them. The
+ * first two columns are the inputs, the next three columns
+ * are the variables, the final two columns indicate how the
+ * variables can be used.
+ *
+ * l/r sl/sr | vals cand off | result value being matched
+ * -------------+-----------------+----------------------------------
+ * dense NULL | NULL NULL set | i off==nil?nil:i+off
+ * dense dense | NULL NULL set | i off==nil?nil:i+off
+ * dense set | NULL set set | cand[i] off==nil?nil:cand[i]+off
+ * set NULL | set NULL 0 | i vals[i]
+ * set dense | set NULL 0 | i vals[i]
+ * set set | set set 0 | cand[i] vals[cand[i]]
+ *
+ * If {l,r}off is lng_nil, all values in the corresponding bat
+ * are oid_nil because the bat has type VOID and the tseqbase
+ * is nil.
+ */
rcandorig = rcand;
rstartorig = rstart;
@@ -1571,34 +1595,36 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
* value in r.
* The next value to match in r is the first
* if equal_order is set, the last
- * otherwise. */
+ * otherwise.
+ * When skipping over values in l, we count
+ * how many we skip in nlx. We need this in
+ * case only_misses or nil_on_miss is set, and
+ * to properly set the tdense property in the
+ * first output BAT. */
BUN nlx = 0; /* number of non-matching values in l */
if (rcand) {
- if (rcand == rcandend)
- v = NULL;
- else
+ if (rcand == rcandend) {
+ v = NULL; /* no more values */
+ } else if (roff != 0) {
+ rval = roff == lng_nil ? oid_nil :
(oid) ((lng) (equal_order ? rcand[0] : rcandend[-1]) + roff);
+ v = &rval;
+ } else {
v = VALUE(r, (equal_order ? rcand[0] :
rcandend[-1]) - r->hseqbase);
+ }
} else {
if (rstart == rend) {
v = NULL;
} else if (rvals) {
v = VALUE(r, equal_order ? rstart :
rend - 1);
- if (roff == lng_nil) {
- rval = oid_nil;
- v = (const char *) &rval;
- } else if (roff != 0) {
- rval = (oid) (*(const oid *)v +
roff);
- v = (const char *) &rval;
- }
} else {
if (roff == lng_nil)
rval = oid_nil;
else if (equal_order)
- rval = rstart + r->tseqbase;
+ rval = (oid) ((lng) rstart +
r->tseqbase);
else
- rval = rend - 1 + r->tseqbase;
- v = (const char *) &rval;
+ rval = (oid) ((lng) rend - 1 +
r->tseqbase);
+ v = &rval;
}
}
/* here, v points to next value in r, or if
@@ -1611,46 +1637,50 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT
nlx = lend - lstart;
lstart = lend;
}
+ } else if (loff == lng_nil) {
+ /* all l values are NIL, and the type is OID */
+ if (* (oid *) v != oid_nil) {
+ /* value we're looking at in r
+ * is not NIL, so we match
+ * nothing */
+ if (lcand) {
+ nlx = (BUN) (lcandend - lcand);
+ lcand = lcandend;
+ } else {
+ nlx = lend - lstart;
+ lstart = lend;
+ }
+ v = NULL;
+ }
} else if (lcand) {
- if (lscan < (BUN) (lcandend - lcand) &&
- lordering * cmp(VALUE(l, lcand[lscan] -
l->hseqbase),
- v) < 0) {
- nlx = binsearch(lcand, l->hseqbase,
- l->ttype, lvals, lvars,
- lwidth, lscan,
- (BUN) (lcandend -
lcand), v,
- lordering, 0);
+ if (lscan < (BUN) (lcandend - lcand)) {
+ if (lvals) {
+ if (lordering * cmp(VALUE(l,
lcand[lscan] - l->hseqbase), v) < 0) {
+ nlx = binsearch(lcand,
l->hseqbase, l->ttype, lvals, lvars, lwidth, lscan, (BUN) (lcandend - lcand),
v, lordering, 0);
+ }
+ } else {
+ lval = (oid) ((lng) *(const
oid*)v - loff);
+ if (lordering > 0 ?
lcand[lscan] < lval : lcand[lscan] > lval) {
+ nlx = binsearch(NULL,
0, TYPE_oid, (const char *) lcand, NULL, SIZEOF_OID, 0, lcandend - lcand,
&lval, 1, 0);
+ }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list