Update of /cvsroot/monetdb/pathfinder/compiler/algebra/map
In directory sc8-pr-cvs16:/tmp/cvs-serv26652/algebra/map
Modified Files:
Makefile.ag intro_proxy.c map_ori_names.c map_unq_names.c
Added Files:
intro_thetajoin.c
Log Message:
Thetajoin operator:
Introduced a thetajoin operator in the logical algebra (and the physical
algebra and the MIL generation). This new thetajoin can handle a list of
conjunctive predicates where each predicate represents an (in-)equality
condition.
Thetajoin introduction:
A thetajoin operator is introduced either by a small pattern
(select-comparison-cross) in opt_general.brg or by a new introduction
phase in intro_thetajoin.c. The latter introduction phase uses a
selection as a basis to find the *correct* equi-join and transforms
this equi-join into a thetajoin (for more details please look into
the code).
Thetajoin optimization:
Similar to the MVD optimization phase that pushes cross product
operators up in the DAG structure this checkin provides a new
optimization phase that pushes thetajoin operators up in the DAG
(opt_thetajoin.c). These rewrites result in thetajoin operators
that contain multiple predicates. (E.g. the transformed equi-joins
eventually hit the selection that triggered the rewrite and thus form
a new scope-dependent value-based join -- see XMark Q3 and Q4.)
Additional changes:
opt_general.brg
* Extended thetajoin pattern to cope with inequality comparisons
* Rewrite the pattern: semijoin (distinct (Rel), Rel) into
distinct (semijoin (Rel, Rel) as the semijoin hopefully reduces
the cardinality more than the distinct operator. (This speeds
up XMark Q12 by a factor of 2.5.)
intro_proxy.c
* Added a preceding phase that removes all semijoin operators.
(Using only simply checks more proxies are now detected.)
* Removed unnest optimization phase (thetajoin optimization
does the same in a more general scenario).
physical.c
* Toyed around with the cost model
* Fixed 'PFord_order_dir_at' bug
planner.c
* Generated more ordered alternatives (even if an ordering is
not required) after an operator is mapped to physical algebra.
In some situations this avoids in a later step planning the same
sort operator multiple times. (We only introduce orderings for
iter-like columns and sort at most two columns. Otherwise the
plan space explodes for some sample queries.)
* Extended list of possible orderings for the rownum operator
Index: intro_proxy.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/intro_proxy.c,v
retrieving revision 1.24
retrieving revision 1.25
diff -u -d -r1.24 -r1.25
--- intro_proxy.c 2 Apr 2007 20:14:53 -0000 1.24
+++ intro_proxy.c 7 May 2007 10:17:08 -0000 1.25
@@ -64,8 +64,81 @@
#define pfIN(p) ((p)->bit_in)
#define pfOUT(p) ((p)->bit_out)
+/**
+ * worker for remove_semijoin_operators() that
+ * replaces semijoin operators by equi-joins and
+ * a distinct.
+ */
+static void
+remove_semijoin_worker (PFla_op_t *p)
+{
+ assert (p);
+
+ /* nothing to do if we already visited that node */
+ if (SEEN(p))
+ return;
+
+ for (unsigned int i = 0; i < PFLA_OP_MAXCHILD && p->child[i]; i++)
+ remove_semijoin_worker (p->child[i]);
+ if (p->kind == la_semijoin) {
+ /* we need to additional projections:
+ top to remove the right join argument from the result list
+ right to avoid name conflicts with the left input columns
+ and to ensure that distinct is applied only on the join
+ column */
+ PFalg_proj_t *top = PFmalloc (p->schema.count *
+ sizeof (PFalg_proj_t)),
+ *right = PFmalloc (sizeof (PFalg_proj_t));
+ PFalg_att_t used_cols = 0,
+ new_col = p->sem.eqjoin.att2,
+ cur_col;
+ /* fill the 'top' projection and collect all used columns
+ of the left input relation (equals the semijoin output
+ schema) */
+ for (unsigned int i = 0; i < p->schema.count; i++) {
+ cur_col = p->schema.items[i].name;
+
+ top[i] = PFalg_proj (cur_col, cur_col);
+ used_cols |= cur_col;
+ }
+ /* rename the join argument in case a name conflict occurs */
+ if (used_cols & new_col)
+ new_col = PFalg_ori_name (
+ PFalg_unq_name (new_col, 0),
+ ~used_cols);
+
+ /* project away all columns except for the join column */
+ right[0] = PFalg_proj (new_col, p->sem.eqjoin.att2);
+
+ /* replace the semijoin */
+ *p = *PFla_project_ (
+ PFla_eqjoin (L(p),
+ PFla_distinct (
+ PFla_project_ (R(p), 1, right)),
+ p->sem.eqjoin.att1,
+ new_col),
+ p->schema.count,
+ top);
+ }
+
+ SEEN(p) = true;
+}
+
+/**
+ * This function removes all semijoin operators
+ * as the proxy introduction phases cannot cope
+ * with them.
+ */
+static void
+remove_semijoin_operators (PFla_op_t *root)
+{
+ assert (root->kind == la_serialize);
+
+ remove_semijoin_worker (root);
+ PFla_dag_reset (root);
+}
/**
*
@@ -277,9 +350,6 @@
p = proxy_entry;
- /* FIXME: This code hasn't been tested on semijoin operators */
- if (p->kind == la_semijoin) return CONFLICT;
-
/* This check is more restrictive as required... */
if (!PFprop_key_left (p->prop, p->sem.eqjoin.att1) ||
!PFprop_key_right (p->prop, p->sem.eqjoin.att2))
@@ -352,6 +422,12 @@
*
*/
+#define check_base(p) (((p)->kind == la_project && \
+ L((p))->kind == la_cross) || \
+ ((p)->kind == la_project && \
+ L((p))->kind == la_thetajoin) || \
+ ((p)->kind == la_cross) || \
+ ((p)->kind == la_thetajoin))
/**
* semijoin_entry checks the complete pattern
* -- see function modify_semijoin_proxy() for
@@ -362,8 +438,7 @@
{
PFla_op_t *lp, *rp;
- if (p->kind != la_eqjoin &&
- p->kind != la_semijoin)
+ if (p->kind != la_eqjoin)
return false;
lp = L(p);
@@ -375,27 +450,9 @@
if (rp->kind == la_project)
rp = L(rp);
- return ((p->kind == la_semijoin &&
- /* here we do not need to ensure that a distinct
- operator is present as duplicates are removed
- anyway */
- lp->kind == la_number &&
- L(lp)->kind == la_cross &&
- !lp->sem.number.part &&
- PFprop_subdom (p->prop,
- PFprop_dom_right (p->prop,
- p->sem.eqjoin.att2),
- PFprop_dom_left (p->prop,
- p->sem.eqjoin.att1)) &&
- PFprop_subdom (p->prop,
- PFprop_dom_left (p->prop,
- p->sem.eqjoin.att1),
- PFprop_dom (lp->prop,
- lp->sem.number.attname)))
- ||
- (lp->kind == la_distinct &&
+ return ((lp->kind == la_distinct &&
rp->kind == la_number &&
- L(rp)->kind == la_cross &&
+ check_base(L(rp)) &&
L(p)->schema.count == 1 &&
lp->schema.count == 1 &&
!rp->sem.number.part &&
@@ -412,7 +469,7 @@
||
(rp->kind == la_distinct &&
lp->kind == la_number &&
- L(lp)->kind == la_cross &&
+ check_base(L(lp)) &&
R(p)->schema.count == 1 &&
rp->schema.count == 1 &&
PFprop_subdom (p->prop,
@@ -436,7 +493,7 @@
{
PFla_op_t *lp, *rp;
- if (p->kind != la_number || L(p)->kind != la_cross)
+ if (p->kind != la_number || !check_base (L(p)))
return false;
lp = L(entry);
@@ -667,9 +724,10 @@
num_col_alias, num_col_alias1, num_col_alias2,
used_cols = 0;
unsigned int i, j;
- PFalg_proj_t *exit_proj, *proxy_proj, *dist_proj, *left_proj;
+ PFalg_proj_t *exit_proj, *proxy_proj, *dist_proj, *left_proj,
+ *opt_base_proj = NULL;
PFla_op_t *lp, *rp, *lproject = NULL, *rproject = NULL,
- *num1, *num2, *number, *new_number;
+ *num1, *num2, *number, *new_number, *lexit, *op;
/* do not introduce proxy if some conflicts remain */
if (!(leaf_ref = join_resolve_conflicts (proxy_entry,
@@ -700,35 +758,23 @@
lp = L(lp);
}
/* look for an project operator in the right equi-join branch */
- if (rp->kind == la_project &&
- proxy_entry->kind != la_semijoin) {
+ if (rp->kind == la_project) {
rproject = rp;
rp = L(rp);
}
/* normalize the pattern such that the distinct
operator always resides in the 'virtual' right side (rp). */
- if (lp->kind == la_distinct &&
- proxy_entry->kind != la_semijoin) {
+ if (lp->kind == la_distinct) {
rp = lp; /* we do not need the old 'rp' reference anymore */
lproject = rproject; /* we do not need the old 'rproject' anymore */
}
- /* in case our proxy entry is a semijoin
- we need to link the arguments of the right
- join argument differently. */
- if (proxy_entry->kind == la_semijoin) {
- assert (R(proxy_entry) == rp);
-
- join_att2 = proxy_entry->sem.eqjoin.att2;
- rp = rp;
- } else {
- assert (rp->kind == la_distinct);
-
- /* the name of the single distinct column */
- join_att2 = rp->schema.items[0].name;
- rp = L(rp);
- }
+ assert (rp->kind == la_distinct);
+
+ /* the name of the single distinct column */
+ join_att2 = rp->schema.items[0].name;
+ rp = L(rp);
/* we have now checked the additional requirements (conflicts
resolved or rewritable) and als have ensured that:
@@ -756,6 +802,17 @@
proxy_exit->schema.items[i].name);
}
+ lexit = L(proxy_exit);
+ if (lexit->kind == la_project) {
+ for (i = 0; i < L(lexit)->schema.count; i++)
+ used_cols = used_cols | L(lexit)->schema.items[i].name;
+ opt_base_proj = PFmalloc ((lexit->schema.count + 2) *
+ sizeof (PFalg_proj_t));
+ for (i = 0; i < lexit->sem.proj.count; i++)
+ opt_base_proj[i+2] = lexit->sem.proj.items[i];
+ lexit = L(lexit);
+ }
+
/* We mark the right join column used.
(It possibly has a different name as we discarded
renaming projections (rproject)) */
@@ -818,12 +875,50 @@
}
/* build the modified DAG */
- num1 = PFla_number (L(L(proxy_exit)), num_col1, att_NULL);
- num2 = PFla_number (R(L(proxy_exit)), num_col2, att_NULL);
+ num1 = PFla_number (L(lexit), num_col1, att_NULL);
+ num2 = PFla_number (R(lexit), num_col2, att_NULL);
- number = PFla_number (
- PFla_cross (num1, num2),
- num_col, att_NULL);
+ if (lexit->kind == la_cross)
+ op = PFla_cross (num1, num2);
+ else if (lexit->kind == la_thetajoin)
+ op = PFla_thetajoin (
+ num1,
+ num2,
+ lexit->sem.thetajoin.count,
+ lexit->sem.thetajoin.pred);
+ else
+ return false;
+
+ if (opt_base_proj) {
+ unsigned int count;
+ PFalg_proj_t *proj;
+
+ opt_base_proj[0] = PFalg_proj (num_col1, num_col1);
+ opt_base_proj[1] = PFalg_proj (num_col2, num_col2);
+ op = PFla_project_ (
+ op,
+ L(proxy_exit)->schema.count + 2,
+ opt_base_proj);
+
+ count = 0;
+ proj = PFmalloc (op->schema.count * sizeof (PFalg_proj_t));
+ for (i = 0; i < op->schema.count; i++)
+ if (PFprop_ocol (num1, op->sem.proj.items[i].old))
+ proj[count++] = op->sem.proj.items[i];
+
+ num1 = PFla_project_ (num1, count, proj);
+
+ count = 0;
+ proj = PFmalloc (op->schema.count * sizeof (PFalg_proj_t));
+ for (i = 0; i < op->schema.count; i++)
+ if (PFprop_ocol (num2, op->sem.proj.items[i].old))
+ proj[count++] = op->sem.proj.items[i];
+
+ num2 = PFla_project_ (num2, count, proj);
+
+ }
+
+ number = PFla_number (op, num_col, att_NULL);
new_number = PFla_number (
PFla_eqjoin (
@@ -1950,6 +2045,24 @@
}
assert (count == proxy_entry->schema.count);
+ /* We are certain that this join is only a mapping join
+ (see join_entry ()). Any input column (at the proxy base)
+ that is propagated along the left *and* the right side
+ of the equi-join and which is not modified contains
+ the same values and can be pruned from the entry_proj
+ projection list. Here we can prune one of its occurrences
+ in the above renaming projection list. */
+ for (i = 0; i < dist_count; i++)
+ for (j = i+1; j < dist_count; j++)
+ if (entry_proj[i].new == entry_proj[j].new) {
+ /* copy the last projection pair to the
+ current column that appears twice */
+ dist_count--;
+ entry_proj[i] = entry_proj[dist_count];
+ proxy_proj[i] = proxy_proj[dist_count];
+ i--;
+ break;
+ }
/* Create a proxy base with a projection that removes duplicate
columns on top of the proxy exit. (If this number operator
@@ -2037,7 +2150,7 @@
-
+#if 0
/**
*
* Functions specific to the nested proxies rewrite.
@@ -2681,7 +2794,7 @@
return true;
}
-
+#endif
@@ -2931,6 +3044,11 @@
{
PFarray_t *checked_nodes = PFarray (sizeof (PFla_op_t *));
+ /* remove all semijoin operators as most of our
+ proxies cannot cope with semijoins and thus
+ would recognize less proxies. */
+ remove_semijoin_operators (root);
+
/* find proxies and rewrite them in one go.
They are based on semi-join - number/rownum pairs */
if (intro_proxy_kind (root,
@@ -2988,6 +3106,10 @@
/* We require the superfluous number operators to be pruned. */
PFalgopt_icol (root);
+#if 0
+ /* We don't need the following rewrite anymore. All observed cases
+ are already handled by the thetajoin optimization. */
+
/* rewrite (directly) following proxies that are independent
of each other into an DAG that might evaluates both proxies
in parallel */
@@ -2997,6 +3119,7 @@
proxy_unnest_exit,
unnest_proxy,
checked_nodes);
+#endif
return root;
}
Index: Makefile.ag
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/Makefile.ag,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- Makefile.ag 3 Jan 2007 12:32:34 -0000 1.2
+++ Makefile.ag 7 May 2007 10:17:08 -0000 1.3
@@ -34,6 +34,7 @@
DIR = libdir
SOURCES = \
intro_proxy.c \
+ intro_thetajoin.c \
map_unq_names.c \
map_ori_names.c \
resolve_proxy.c
--- NEW FILE: intro_thetajoin.c ---
/**
* @file
*
* Introduce thetajoin operators.
*
* In this phase we transform equi-join operators into theta-joins.
* While both operators initally do the same the new theta-join operator
* allows us to create a new join rewrite strategy. Instead of pushing
* down theta-joins (like equi-joins) we move them as far up as possible.
*
* To choose which equi-join is transformed into a theta-join we start
* with the goal to transform a selection into a join predicate. Starting
* from selection operators we first find the corresponding comparison
* operator(s) (if any). These comparison operators than provide a left
* and a right join argument which we follow until the origin of the
* left and the right comparison argument is unified in a single column.
* At this place we have found the first common scope of both comparison
* sides. We now only have to transform the first equi-join operator on our
* way back to the root of the traversal (the select operator) into a
* theta-join operator.
* In a following rewrite phase the new theta-join operator will
* eventually become a real theta-join that has to evaluate a conjunctive
* join.
*
* Copyright Notice:
* -----------------
*
* The contents of this file are subject to the Pathfinder Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://monetdb.cwi.nl/Legal/PathfinderLicense-1.1.html
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is the Pathfinder system.
*
* The Original Code has initially been developed by the Database &
* Information Systems Group at the University of Konstanz, Germany and
* is now maintained by the Database Systems Group at the Technische
* Universitaet Muenchen, Germany. Portions created by the University of
* Konstanz and the Technische Universitaet Muenchen are Copyright (C)
* 2000-2005 University of Konstanz and (C) 2005-2007 Technische
* Universitaet Muenchen, respectively. All Rights Reserved.
*
* $Id: intro_thetajoin.c,v 1.1 2007/05/07 10:17:08 tsheyar Exp $
*/
/* always include pathfinder.h first! */
#include "pathfinder.h"
#include <assert.h>
#include <stdio.h>
#include "la_thetajoin.h"
#include "properties.h"
#include "mem.h" /* PFmalloc() */
#include "oops.h"
#include "alg_dag.h"
/*
* Easily access subtree-parts.
*/
/** starting from p, make a step left */
#define L(p) ((p)->child[0])
/** starting from p, make a step right */
#define R(p) ((p)->child[1])
#define SEEN(n) (n)->bit_dag
#define EDGE(n) (n)->state_label
#define BOOL_COLS(n) (n)->prop->icols
#define LEFT_COLS(n) (n)->prop->l_icols
#define RIGHT_COLS(n) (n)->prop->r_icols
/**
* Returns the intersection of an column list @a cols and a schema
* @a schema_ocols
*/
static PFalg_att_t
intersect_ocol (PFalg_att_t cols, PFalg_schema_t schema_ocols)
{
PFalg_att_t ocols = 0;
/* intersect attributes */
for (unsigned int j = 0; j < schema_ocols.count; j++)
ocols |= schema_ocols.items[j].name;
return cols & ocols;
}
/**
* Returns union of two column lists
*/
static PFalg_att_t
union_ (PFalg_att_t a, PFalg_att_t b)
{
return a | b;
}
/**
* Returns the difference of two column lists
*/
static PFalg_att_t
diff (PFalg_att_t a, PFalg_att_t b)
{
return a & (~b);
}
/**
* Worker function for find_join (). It traverses
* the DAG starting from a select operator and infers
* three different informations:
* * bool_cols: The boolean column name(s) consumed
* by the select operator. If a boolean marked column
* hits a comparison operator new left and right
* columns are introduced.
* * left_cols: Column (names) that are necessary to
* provide the left input of a comparison.
* * right_cols: Column (names) that are necessary to
* provide the right input of a comparison.
*
* This worker ends if the left and the right column lists
* overlap. In this case we have found something similar
* to the lowest common scope (where a split appeared).
*/
static bool
find_join_worker (PFla_op_t *n,
PFla_op_t **join,
PFalg_att_t bool_cols,
PFalg_att_t left_cols,
PFalg_att_t right_cols)
{
assert (n);
/* collect all input columns */
BOOL_COLS(n) = union_ (BOOL_COLS(n), bool_cols);
LEFT_COLS(n) = union_ (LEFT_COLS(n), left_cols);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), right_cols);
EDGE(n)++;
/* nothing to do if we haven't collected
all incoming column lists of that node */
if (EDGE(n) < PFprop_refctr (n))
return false;
/* remove all unnecessary columns from our structure */
BOOL_COLS(n) = intersect_ocol (BOOL_COLS(n), n->schema);
LEFT_COLS(n) = intersect_ocol (LEFT_COLS(n), n->schema);
RIGHT_COLS(n) = intersect_ocol (RIGHT_COLS(n), n->schema);
/* If both the left side and the right side of a comparison
reference the same column in a number operator
we have (hopefully) reached our goal. */
if (n->kind == la_number &&
LEFT_COLS(n) && RIGHT_COLS(n) &&
((LEFT_COLS(n) & RIGHT_COLS(n)) == LEFT_COLS(n) ||
(LEFT_COLS(n) & RIGHT_COLS(n)) == RIGHT_COLS(n)))
return true;
switch (n->kind) {
case la_serialize:
assert (!"this operator should never occur");
break;
case la_lit_tbl:
case la_empty_tbl:
break;
case la_attach:
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.attach.attname);
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.attach.attname);
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.attach.attname);
break;
case la_cross:
case la_semijoin:
case la_select:
case la_disjunion:
case la_intersect:
case la_difference:
case la_distinct:
break;
case la_eqjoin:
case la_thetajoin:
/* make sure to remember the last join node where we split
up the left and the right cols */
if ((intersect_ocol (LEFT_COLS(n), L(n)->schema) ^
intersect_ocol (LEFT_COLS(n), R(n)->schema))
&&
(intersect_ocol (RIGHT_COLS(n), L(n)->schema) ^
intersect_ocol (RIGHT_COLS(n), R(n)->schema))
&&
((intersect_ocol (LEFT_COLS(n), L(n)->schema) &&
intersect_ocol (RIGHT_COLS(n), R(n)->schema))
^
(intersect_ocol (RIGHT_COLS(n), L(n)->schema) &&
intersect_ocol (LEFT_COLS(n), R(n)->schema))))
*join = n;
break;
case la_project:
{
PFalg_att_t bool_cols = 0,
left_cols = 0,
right_cols = 0;
/* rename columns from new to old */
for (unsigned int i = 0; i < n->sem.proj.count; i++) {
if (BOOL_COLS(n) & n->sem.proj.items[i].new)
bool_cols = union_ (bool_cols, n->sem.proj.items[i].old);
if (LEFT_COLS(n) & n->sem.proj.items[i].new)
left_cols = union_ (left_cols, n->sem.proj.items[i].old);
if (RIGHT_COLS(n) & n->sem.proj.items[i].new)
right_cols = union_ (right_cols, n->sem.proj.items[i].old);
}
BOOL_COLS(n) = bool_cols;
LEFT_COLS(n) = left_cols;
RIGHT_COLS(n) = right_cols;
} break;
case la_fun_1to1:
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.fun_1to1.res);
/* ignore fn:contains() here as we do not know
how to implement it in a theta-join */
if (LEFT_COLS(n) & n->sem.fun_1to1.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.fun_1to1.res);
for (unsigned int i = 0; i < n->sem.fun_1to1.refs.count; i++)
LEFT_COLS(n) = union_ (LEFT_COLS(n),
n->sem.fun_1to1.refs.atts[i]);
}
if (RIGHT_COLS(n) & n->sem.fun_1to1.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.fun_1to1.res);
for (unsigned int i = 0; i < n->sem.fun_1to1.refs.count; i++)
RIGHT_COLS(n) = union_ (RIGHT_COLS(n),
n->sem.fun_1to1.refs.atts[i]);
}
break;
case la_num_eq:
case la_num_gt:
if (BOOL_COLS(n) & n->sem.binary.res) {
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.binary.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
}
if (LEFT_COLS(n) & n->sem.binary.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.binary.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
}
if (RIGHT_COLS(n) & n->sem.binary.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.binary.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
}
break;
case la_bool_and:
if (BOOL_COLS(n) & n->sem.binary.res) {
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.binary.res);
BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.binary.att1);
BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.binary.att2);
}
if (LEFT_COLS(n) & n->sem.binary.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.binary.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
}
if (RIGHT_COLS(n) & n->sem.binary.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.binary.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
}
break;
case la_bool_or:
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.binary.res);
if (LEFT_COLS(n) & n->sem.binary.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.binary.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
}
if (RIGHT_COLS(n) & n->sem.binary.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.binary.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
}
break;
case la_bool_not:
if (BOOL_COLS(n) & n->sem.unary.res) {
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.unary.res);
BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.unary.att);
}
if (LEFT_COLS(n) & n->sem.unary.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.unary.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.unary.att);
}
if (RIGHT_COLS(n) & n->sem.unary.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.unary.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.unary.att);
}
break;
case la_avg:
case la_max:
case la_min:
case la_sum:
case la_seqty1:
case la_all:
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.aggr.res);
if (LEFT_COLS(n) & n->sem.aggr.res)
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.aggr.att);
if (RIGHT_COLS(n) & n->sem.aggr.res)
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.aggr.att);
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.aggr.res);
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.aggr.res);
case la_count:
/* keep partition columns for LEFT_COLS and RIGHT_COLS */
/* BOOL_COLS can never occur as partitioning as there
are no boolean partition criteria */
break;
case la_rownum:
if (LEFT_COLS(n) & n->sem.rownum.attname) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.rownum.attname);
for (unsigned int i = 0;
i < PFord_count (n->sem.rownum.sortby);
i++)
LEFT_COLS(n) = union_ (LEFT_COLS(n),
PFord_order_col_at (
n->sem.rownum.sortby, i));
/* only infer part if available */
if (n->sem.rownum.part != att_NULL)
LEFT_COLS(n) = union_ (LEFT_COLS(n),
n->sem.rownum.part);
}
if (RIGHT_COLS(n) & n->sem.rownum.attname) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.rownum.attname);
for (unsigned int i = 0;
i < PFord_count (n->sem.rownum.sortby);
i++)
RIGHT_COLS(n) = union_ (RIGHT_COLS(n),
PFord_order_col_at (
n->sem.rownum.sortby, i));
/* only infer part if available */
if (n->sem.rownum.part != att_NULL)
RIGHT_COLS(n) = union_ (RIGHT_COLS(n),
n->sem.rownum.part);
}
break;
case la_number:
if (LEFT_COLS(n) & n->sem.number.attname && n->sem.number.part)
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.number.part);
if (RIGHT_COLS(n) & n->sem.number.attname && n->sem.number.part)
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.number.part);
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.number.attname);
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.number.attname);
break;
case la_type:
case la_cast:
BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.type.res);
if (LEFT_COLS(n) & n->sem.type.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.type.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.type.att);
}
if (RIGHT_COLS(n) & n->sem.type.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.type.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.type.att);
}
break;
case la_type_assert:
break;
case la_scjoin:
if (LEFT_COLS(n) & n->sem.scjoin.item_res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.scjoin.item_res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.scjoin.item);
}
if (RIGHT_COLS(n) & n->sem.scjoin.item_res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.scjoin.item_res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.scjoin.item);
}
break;
case la_dup_scjoin:
if (LEFT_COLS(n) & n->sem.scjoin.item_res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.scjoin.item_res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.scjoin.item);
}
if (RIGHT_COLS(n) & n->sem.scjoin.item_res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.scjoin.item_res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.scjoin.item);
}
break;
case la_doc_tbl:
if (LEFT_COLS(n) & n->sem.doc_tbl.item_res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.doc_tbl.item_res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.doc_tbl.item);
}
if (RIGHT_COLS(n) & n->sem.doc_tbl.item_res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.doc_tbl.item_res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.doc_tbl.item);
}
break;
case la_doc_access:
if (LEFT_COLS(n) & n->sem.doc_access.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.doc_access.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.doc_access.att);
}
if (RIGHT_COLS(n) & n->sem.doc_access.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.doc_access.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.doc_access.att);
}
break;
case la_element:
/* FIXME: for now we assume that a theta-join
cannot be pushed through an element constructor */
LEFT_COLS(n) = att_NULL;
RIGHT_COLS(n) = att_NULL;
break;
case la_element_tag:
break;
case la_attribute:
if (LEFT_COLS(n) & n->sem.attr.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.attr.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.attr.qn);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.attr.val);
}
if (RIGHT_COLS(n) & n->sem.attr.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.attr.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.attr.qn);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.attr.val);
}
break;
case la_textnode:
if (LEFT_COLS(n) & n->sem.textnode.res) {
LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.textnode.res);
LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.textnode.item);
}
if (RIGHT_COLS(n) & n->sem.textnode.res) {
RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.textnode.res);
RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.textnode.item);
}
break;
case la_docnode:
case la_comment:
case la_processi:
assert (!"not implemented yet?");
case la_merge_adjacent:
/* FIXME: for now we assume that a theta-join
cannot be pushed through an element constructor */
LEFT_COLS(n) = att_NULL;
RIGHT_COLS(n) = att_NULL;
break;
case la_roots:
break;
case la_frag_union:
case la_fragment:
case la_empty_frag:
/* for the fragment information we do not need to introduce
column names as the thetajoin can never be moved along
its edges. */
LEFT_COLS(n) = att_NULL;
RIGHT_COLS(n) = att_NULL;
break;
case la_cond_err:
case la_trace:
/* propagate input columns to the left child ... */
if (find_join_worker (
L(n),
join,
BOOL_COLS(n),
LEFT_COLS(n),
RIGHT_COLS(n)))
return true;
/* ... and propagate nothing to the right child */
if (find_join_worker (
R(n),
join,
0, 0, 0))
return true;
return false;
break;
case la_nil:
break;
case la_trace_msg:
/* this operator cannot be reached */
break;
case la_trace_map:
/* this operator cannot be reached */
break;
case la_rec_fix:
/* FIXME: for now we assume that a theta-join
cannot be pushed through a recursion operator */
LEFT_COLS(n) = att_NULL;
RIGHT_COLS(n) = att_NULL;
break;
case la_rec_param:
case la_rec_arg:
case la_rec_base:
break;
case la_proxy:
case la_proxy_base:
break;
case la_string_join:
/* propagate updated item column to the left child ... */
if (find_join_worker (
L(n),
join,
BOOL_COLS(n),
LEFT_COLS(n) ? n->sem.string_join.item : 0,
RIGHT_COLS(n) ? n->sem.string_join.item : 0))
return true;
/* ... and propagate the item separator to the right child */
if (find_join_worker (
R(n),
join,
BOOL_COLS(n),
LEFT_COLS(n) ? n->sem.string_join.item_sep : 0,
RIGHT_COLS(n) ? n->sem.string_join.item_sep : 0))
return true;
return false;
break;
case la_eqjoin_unq:
PFoops (OOPS_FATAL,
"clone column aware equi-join operator is "
"only allowed with unique names!");
case la_cross_mvd:
PFoops (OOPS_FATAL,
"clone column aware cross product operator is "
"only allowed inside mvd optimization!");
case la_dummy:
break;
}
/* infer properties for children */
if (L(n) && find_join_worker (L(n),
join,
BOOL_COLS(n),
LEFT_COLS(n),
RIGHT_COLS(n)))
return true;
if (R(n) && find_join_worker (R(n),
join,
BOOL_COLS(n),
LEFT_COLS(n),
RIGHT_COLS(n)))
return true;
return false;
}
/**
* Reset the property information.
*/
static void
reset_fun (PFla_op_t *n)
{
EDGE(n) = 0;
/* reset the referenced column information */
BOOL_COLS(n) = 0;
LEFT_COLS(n) = 0;
RIGHT_COLS(n) = 0;
}
/**
* For each select node we reset the property information
* and let a worker function find a join operator that can
* be rewritten.
*/
static PFla_op_t *
find_join (PFla_op_t *select)
{
PFla_op_t *join = NULL;
assert (select && select->kind == la_select);
/* collect number of incoming edges (parents) */
PFprop_infer_refctr (select);
/* reset the old property information */
PFprop_reset (select, reset_fun);
if (find_join_worker (L(select),
&join,
select->sem.select.att,
0,
0) &&
join &&
join->kind == la_eqjoin)
return join;
else
return NULL;
}
/**
* Rewrite an equi-join operator into a theta-join operator.
*/
static void
transform_join (PFla_op_t *join)
{
PFalg_sel_t *pred = PFmalloc (sizeof (PFalg_sel_t));
assert (join && join->kind == la_eqjoin);
pred[0] = PFalg_sel (alg_comp_eq,
join->sem.eqjoin.att1,
join->sem.eqjoin.att2);
*join = *PFla_thetajoin (L(join), R(join), 1, pred);
}
/**
* Collect all select nodes in our logical plan. (Starting
* from these nodes we then try to find places where we
* can introduce theta-join operators that (after optimization)
* might hopefully merge with the select nodes.)
*/
static void
collect_select_nodes (PFla_op_t *n, PFarray_t *select_nodes)
{
assert (n);
/* nothing to do if we already visited that node */
if (SEEN(n))
return;
/* infer properties for children */
for (unsigned int i = 0; i < PFLA_OP_MAXCHILD && n->child[i]; i++)
collect_select_nodes (n->child[i], select_nodes);
if (n->kind == la_select)
*(PFla_op_t **) PFarray_add (select_nodes) = n;
SEEN(n) = true;
}
/**
* Introduce theta-join operators.
*/
PFla_op_t *
PFintro_thetajoins (PFla_op_t *root)
{
PFarray_t *select_nodes = PFarray (sizeof (PFla_op_t *));
PFla_op_t *select, *join;
/* collect the select nodes */
collect_select_nodes (root, select_nodes);
PFla_dag_reset (root);
for (unsigned int i = 0; i < PFarray_last (select_nodes); i++) {
/* for each select node we try to find a corresponding join */
select = *(PFla_op_t **) PFarray_at (select_nodes, i);
assert (select->kind == la_select);
/* start the property inference (and find
as a side effect a matching join operator) */
if ((join = find_join (select)))
/* transform the equi-join into a theta-join */
transform_join (join);
}
return root;
}
/* vim:set shiftwidth=4 expandtab filetype=c: */
Index: map_ori_names.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/map_ori_names.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -d -r1.11 -r1.12
--- map_ori_names.c 15 Mar 2007 14:12:55 -0000 1.11
+++ map_ori_names.c 7 May 2007 10:17:12 -0000 1.12
@@ -329,6 +329,20 @@
p->sem.eqjoin.att2));
break;
+ case la_thetajoin:
+ {
+ PFalg_sel_t *pred = PFmalloc (p->sem.thetajoin.count *
+ sizeof (PFalg_sel_t));
+
+ for (unsigned int i = 0; i < p->sem.thetajoin.count; i++)
+ pred[i] = PFalg_sel (p->sem.thetajoin.pred[i].comp,
+ ONAME (p, p->sem.thetajoin.pred[i].left),
+ ONAME (p,
p->sem.thetajoin.pred[i].right));
+
+ res = thetajoin (PROJ(LEFT, p), PROJ(RIGHT, p),
+ p->sem.thetajoin.count, pred);
+ } break;
+
case la_project:
{
PFla_op_t *left;
Index: map_unq_names.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/map_unq_names.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -d -r1.12 -r1.13
--- map_unq_names.c 15 Mar 2007 14:12:55 -0000 1.12
+++ map_unq_names.c 7 May 2007 10:17:12 -0000 1.13
@@ -168,6 +168,7 @@
break;
case la_cross:
+ case la_thetajoin:
{
/* Add a projection for each operand to ensure
that all columns are renamed. */
@@ -210,9 +211,24 @@
left = PFla_project_ (left, count1, projlist1);
right = PFla_project_ (right, count2, projlist2);
- res = cross (left, right);
+ if (p->kind == la_cross)
+ res = cross (left, right);
+ else { /* p->kind == la_thetajoin */
+ PFalg_sel_t *pred = PFmalloc (p->sem.thetajoin.count *
+ sizeof (PFalg_sel_t));
+
+ for (unsigned int i = 0; i < p->sem.thetajoin.count; i++)
+ pred[i] = PFalg_sel (
+ p->sem.thetajoin.pred[i].comp,
+ UNAME (p, p->sem.thetajoin.pred[i].left),
+ UNAME (p, p->sem.thetajoin.pred[i].right));
+
+ res = thetajoin (left, right,
+ p->sem.thetajoin.count, pred);
+ }
} break;
+
case la_cross_mvd:
PFoops (OOPS_FATAL,
"clone column unaware cross product operator is "
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins