Update of /cvsroot/monetdb/pathfinder/compiler/algebra/map
In directory sc8-pr-cvs16:/tmp/cvs-serv26652/algebra/map

Modified Files:
        Makefile.ag intro_proxy.c map_ori_names.c map_unq_names.c 
Added Files:
        intro_thetajoin.c 
Log Message:
Thetajoin operator:

    Introduced a thetajoin operator in the logical algebra (and the physical
    algebra and the MIL generation). This new thetajoin can handle a list of
    conjunctive predicates where each predicate represents an (in-)equality
    condition.

Thetajoin introduction:

    A thetajoin operator is introduced either by a small pattern
    (select-comparison-cross) in opt_general.brg or by a new introduction
    phase in intro_thetajoin.c. The latter introduction phase uses a
    selection as a basis to find the *correct* equi-join and transforms
    this equi-join into a thetajoin (for more details please look into
    the code).

Thetajoin optimization:

    Similar to the MVD optimization phase that pushes cross product
    operators up in the DAG structure this checkin provides a new
    optimization phase that pushes thetajoin operators up in the DAG
    (opt_thetajoin.c). These rewrites result in thetajoin operators
    that contain multiple predicates. (E.g. the transformed equi-joins
    eventually hit the selection that triggered the rewrite and thus form
    a new scope-dependent value-based join -- see XMark Q3 and Q4.)

Additional changes:

    opt_general.brg
     * Extended thetajoin pattern to cope with inequality comparisons
     * Rewrite the pattern: semijoin (distinct (Rel), Rel) into
       distinct (semijoin (Rel, Rel) as the semijoin hopefully reduces
       the cardinality more than the distinct operator. (This speeds
       up XMark Q12 by a factor of 2.5.)

    intro_proxy.c
     * Added a preceding phase that removes all semijoin operators.
       (Using only simply checks more proxies are now detected.)
     * Removed unnest optimization phase (thetajoin optimization
       does the same in a more general scenario).

    physical.c
     * Toyed around with the cost model
     * Fixed 'PFord_order_dir_at' bug

    planner.c
     * Generated more ordered alternatives (even if an ordering is
       not required) after an operator is mapped to physical algebra.
       In some situations this avoids in a later step planning the same
       sort operator multiple times. (We only introduce orderings for
       iter-like columns and sort at most two columns. Otherwise the
       plan space explodes for some sample queries.)
     * Extended list of possible orderings for the rownum operator



Index: intro_proxy.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/intro_proxy.c,v
retrieving revision 1.24
retrieving revision 1.25
diff -u -d -r1.24 -r1.25
--- intro_proxy.c       2 Apr 2007 20:14:53 -0000       1.24
+++ intro_proxy.c       7 May 2007 10:17:08 -0000       1.25
@@ -64,8 +64,81 @@
 #define pfIN(p)  ((p)->bit_in)
 #define pfOUT(p) ((p)->bit_out)
 
+/**
+ * worker for remove_semijoin_operators() that
+ * replaces semijoin operators by equi-joins and
+ * a distinct.
+ */
+static void
+remove_semijoin_worker (PFla_op_t *p)
+{
+    assert (p);
+
+    /* nothing to do if we already visited that node */
+    if (SEEN(p))
+        return;
+
+    for (unsigned int i = 0; i < PFLA_OP_MAXCHILD && p->child[i]; i++)
+        remove_semijoin_worker (p->child[i]);
 
+    if (p->kind == la_semijoin) {
+        /* we need to additional projections:
+           top    to remove the right join argument from the result list
+           right  to avoid name conflicts with the left input columns
+                  and to ensure that distinct is applied only on the join
+                  column */
+        PFalg_proj_t *top   = PFmalloc (p->schema.count *
+                                        sizeof (PFalg_proj_t)),
+                     *right = PFmalloc (sizeof (PFalg_proj_t));
+        PFalg_att_t   used_cols = 0,
+                      new_col = p->sem.eqjoin.att2,
+                      cur_col;
 
+        /* fill the 'top' projection and collect all used columns
+           of the left input relation (equals the semijoin output
+           schema) */
+        for (unsigned int i = 0; i < p->schema.count; i++) {
+            cur_col = p->schema.items[i].name;
+            
+            top[i] = PFalg_proj (cur_col, cur_col);
+            used_cols |= cur_col;
+        }
+        /* rename the join argument in case a name conflict occurs */
+        if (used_cols & new_col)
+            new_col = PFalg_ori_name (
+                          PFalg_unq_name (new_col, 0),
+                          ~used_cols);
+        
+        /* project away all columns except for the join column */
+        right[0] = PFalg_proj (new_col, p->sem.eqjoin.att2);
+        
+        /* replace the semijoin */
+        *p = *PFla_project_ (
+                  PFla_eqjoin (L(p),
+                               PFla_distinct (
+                                   PFla_project_ (R(p), 1, right)),
+                               p->sem.eqjoin.att1,
+                               new_col),
+                  p->schema.count,
+                  top);
+    }
+
+    SEEN(p) = true;
+}
+
+/**
+ * This function removes all semijoin operators
+ * as the proxy introduction phases cannot cope
+ * with them.
+ */
+static void
+remove_semijoin_operators (PFla_op_t *root)
+{
+    assert (root->kind == la_serialize);
+
+    remove_semijoin_worker (root);
+    PFla_dag_reset (root);
+}
 
 /**
  *
@@ -277,9 +350,6 @@
 
     p = proxy_entry;
 
-    /* FIXME: This code hasn't been tested on semijoin operators */
-    if (p->kind == la_semijoin) return CONFLICT; 
-    
     /* This check is more restrictive as required... */
     if (!PFprop_key_left (p->prop, p->sem.eqjoin.att1) ||
         !PFprop_key_right (p->prop, p->sem.eqjoin.att2))
@@ -352,6 +422,12 @@
  *
  */
 
+#define check_base(p) (((p)->kind == la_project &&       \
+                        L((p))->kind == la_cross) ||     \
+                       ((p)->kind == la_project &&       \
+                        L((p))->kind == la_thetajoin) || \
+                       ((p)->kind == la_cross) ||        \
+                       ((p)->kind == la_thetajoin))
 /**
  * semijoin_entry checks the complete pattern
  * -- see function modify_semijoin_proxy() for
@@ -362,8 +438,7 @@
 {
     PFla_op_t *lp, *rp;
 
-    if (p->kind != la_eqjoin &&
-        p->kind != la_semijoin)
+    if (p->kind != la_eqjoin)
         return false;
 
     lp = L(p);
@@ -375,27 +450,9 @@
     if (rp->kind == la_project)
         rp = L(rp);
 
-    return ((p->kind == la_semijoin &&
-             /* here we do not need to ensure that a distinct
-                operator is present as duplicates are removed
-                anyway */
-             lp->kind == la_number &&
-             L(lp)->kind == la_cross &&
-             !lp->sem.number.part &&
-             PFprop_subdom (p->prop,
-                            PFprop_dom_right (p->prop,
-                                              p->sem.eqjoin.att2),
-                            PFprop_dom_left (p->prop,
-                                             p->sem.eqjoin.att1)) &&
-             PFprop_subdom (p->prop,
-                            PFprop_dom_left (p->prop,
-                                             p->sem.eqjoin.att1),
-                            PFprop_dom (lp->prop,
-                                        lp->sem.number.attname)))
-            ||
-            (lp->kind == la_distinct &&
+    return ((lp->kind == la_distinct &&
              rp->kind == la_number &&
-             L(rp)->kind == la_cross &&
+             check_base(L(rp)) &&
              L(p)->schema.count == 1 &&
              lp->schema.count == 1 &&
              !rp->sem.number.part &&
@@ -412,7 +469,7 @@
             ||
             (rp->kind == la_distinct &&
              lp->kind == la_number &&
-             L(lp)->kind == la_cross &&
+             check_base(L(lp)) &&
              R(p)->schema.count == 1 &&
              rp->schema.count == 1 &&
              PFprop_subdom (p->prop,
@@ -436,7 +493,7 @@
 {
     PFla_op_t *lp, *rp;
 
-    if (p->kind != la_number || L(p)->kind != la_cross)
+    if (p->kind != la_number || !check_base (L(p)))
         return false;
 
     lp = L(entry);
@@ -667,9 +724,10 @@
                 num_col_alias, num_col_alias1, num_col_alias2,
                 used_cols = 0;
     unsigned int i, j;
-    PFalg_proj_t *exit_proj, *proxy_proj, *dist_proj, *left_proj;
+    PFalg_proj_t *exit_proj, *proxy_proj, *dist_proj, *left_proj,
+                 *opt_base_proj = NULL;
     PFla_op_t *lp, *rp, *lproject = NULL, *rproject = NULL,
-              *num1, *num2, *number, *new_number;
+              *num1, *num2, *number, *new_number, *lexit, *op;
 
     /* do not introduce proxy if some conflicts remain */
     if (!(leaf_ref = join_resolve_conflicts (proxy_entry,
@@ -700,35 +758,23 @@
         lp = L(lp);
     }
     /* look for an project operator in the right equi-join branch */
-    if (rp->kind == la_project &&
-        proxy_entry->kind != la_semijoin) {
+    if (rp->kind == la_project) {
         rproject = rp;
         rp = L(rp);
     }
 
     /* normalize the pattern such that the distinct
        operator always resides in the 'virtual' right side (rp). */
-    if (lp->kind == la_distinct &&
-        proxy_entry->kind != la_semijoin) {
+    if (lp->kind == la_distinct) {
         rp = lp; /* we do not need the old 'rp' reference anymore */
         lproject = rproject; /* we do not need the old 'rproject' anymore */
     }
 
-    /* in case our proxy entry is a semijoin
-       we need to link the arguments of the right
-       join argument differently. */
-    if (proxy_entry->kind == la_semijoin) {
-        assert (R(proxy_entry) == rp);
-
-        join_att2 = proxy_entry->sem.eqjoin.att2;
-        rp = rp;
-    } else {
-        assert (rp->kind == la_distinct);
-        
-        /* the name of the single distinct column */
-        join_att2 = rp->schema.items[0].name;
-        rp = L(rp);
-    }
+    assert (rp->kind == la_distinct);
+    
+    /* the name of the single distinct column */
+    join_att2 = rp->schema.items[0].name;
+    rp = L(rp);
 
     /* we have now checked the additional requirements (conflicts
        resolved or rewritable) and als have ensured that:
@@ -756,6 +802,17 @@
                            proxy_exit->schema.items[i].name);
     }
 
+    lexit = L(proxy_exit);
+    if (lexit->kind == la_project) {
+        for (i = 0; i < L(lexit)->schema.count; i++)
+            used_cols = used_cols | L(lexit)->schema.items[i].name;
+        opt_base_proj = PFmalloc ((lexit->schema.count + 2) *
+                                  sizeof (PFalg_proj_t));
+        for (i = 0; i < lexit->sem.proj.count; i++)
+            opt_base_proj[i+2] = lexit->sem.proj.items[i];
+        lexit = L(lexit);
+    }
+
     /* We mark the right join column used.
        (It possibly has a different name as we discarded
         renaming projections (rproject)) */
@@ -818,12 +875,50 @@
     }
 
     /* build the modified DAG */
-    num1 = PFla_number (L(L(proxy_exit)), num_col1, att_NULL);
-    num2 = PFla_number (R(L(proxy_exit)), num_col2, att_NULL);
+    num1 = PFla_number (L(lexit), num_col1, att_NULL);
+    num2 = PFla_number (R(lexit), num_col2, att_NULL);
 
-    number = PFla_number (
-                 PFla_cross (num1, num2),
-                 num_col, att_NULL);
+    if (lexit->kind == la_cross)
+        op = PFla_cross (num1, num2);
+    else if (lexit->kind == la_thetajoin)
+        op = PFla_thetajoin (
+                 num1,
+                 num2,
+                 lexit->sem.thetajoin.count,
+                 lexit->sem.thetajoin.pred);
+    else
+        return false;
+
+    if (opt_base_proj) {
+        unsigned int count;
+        PFalg_proj_t *proj;
+        
+        opt_base_proj[0] = PFalg_proj (num_col1, num_col1);
+        opt_base_proj[1] = PFalg_proj (num_col2, num_col2);
+        op = PFla_project_ (
+                 op,
+                 L(proxy_exit)->schema.count + 2,
+                 opt_base_proj);
+
+        count = 0;
+        proj = PFmalloc (op->schema.count * sizeof (PFalg_proj_t));
+        for (i = 0; i < op->schema.count; i++)
+            if (PFprop_ocol (num1, op->sem.proj.items[i].old))
+                proj[count++] = op->sem.proj.items[i];
+
+        num1 = PFla_project_ (num1, count, proj);
+                
+        count = 0;
+        proj = PFmalloc (op->schema.count * sizeof (PFalg_proj_t));
+        for (i = 0; i < op->schema.count; i++)
+            if (PFprop_ocol (num2, op->sem.proj.items[i].old))
+                proj[count++] = op->sem.proj.items[i];
+
+        num2 = PFla_project_ (num2, count, proj);
+                
+    }
+    
+    number = PFla_number (op, num_col, att_NULL);
 
     new_number = PFla_number (
                      PFla_eqjoin (
@@ -1950,6 +2045,24 @@
     }
     assert (count == proxy_entry->schema.count);
 
+    /* We are certain that this join is only a mapping join 
+       (see join_entry ()). Any input column (at the proxy base)
+       that is propagated along the left *and* the right side
+       of the equi-join and which is not modified contains 
+       the same values and can be pruned from the entry_proj
+       projection list. Here we can prune one of its occurrences
+       in the above renaming projection list. */       
+    for (i = 0; i < dist_count; i++)
+        for (j = i+1; j < dist_count; j++)
+            if (entry_proj[i].new == entry_proj[j].new) {
+                /* copy the last projection pair to the
+                   current column that appears twice */
+                dist_count--;
+                entry_proj[i] = entry_proj[dist_count];
+                proxy_proj[i] = proxy_proj[dist_count];
+                i--;
+                break;
+            }
 
     /* Create a proxy base with a projection that removes duplicate
        columns on top of the proxy exit. (If this number operator
@@ -2037,7 +2150,7 @@
 
 
 
-
+#if 0
 /**
  *
  * Functions specific to the nested proxies rewrite.
@@ -2681,7 +2794,7 @@
 
     return true;
 }
-
+#endif
 
 
 
@@ -2931,6 +3044,11 @@
 {
     PFarray_t *checked_nodes = PFarray (sizeof (PFla_op_t *));
 
+    /* remove all semijoin operators as most of our
+       proxies cannot cope with semijoins and thus
+       would recognize less proxies. */
+    remove_semijoin_operators (root);
+
     /* find proxies and rewrite them in one go.
        They are based on semi-join - number/rownum pairs */
     if (intro_proxy_kind (root,
@@ -2988,6 +3106,10 @@
     /* We require the superfluous number operators to be pruned. */
     PFalgopt_icol (root);
 
+#if 0
+    /* We don't need the following rewrite anymore. All observed cases
+       are already handled by the thetajoin optimization. */
+
     /* rewrite (directly) following proxies that are independent
        of each other into an DAG that might evaluates both proxies
        in parallel */
@@ -2997,6 +3119,7 @@
                       proxy_unnest_exit,
                       unnest_proxy,
                       checked_nodes);
+#endif
 
     return root;
 }

Index: Makefile.ag
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/Makefile.ag,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- Makefile.ag 3 Jan 2007 12:32:34 -0000       1.2
+++ Makefile.ag 7 May 2007 10:17:08 -0000       1.3
@@ -34,6 +34,7 @@
         DIR = libdir
         SOURCES = \
                 intro_proxy.c \
+                intro_thetajoin.c \
                 map_unq_names.c \
                 map_ori_names.c \
                 resolve_proxy.c

--- NEW FILE: intro_thetajoin.c ---
/**
 * @file
 *
 * Introduce thetajoin operators.
 *
 * In this phase we transform equi-join operators into theta-joins.
 * While both operators initally do the same the new theta-join operator
 * allows us to create a new join rewrite strategy. Instead of pushing
 * down theta-joins (like equi-joins) we move them as far up as possible.
 *
 * To choose which equi-join is transformed into a theta-join we start
 * with the goal to transform a selection into a join predicate. Starting
 * from selection operators we first find the corresponding comparison 
 * operator(s) (if any). These comparison operators than provide a left
 * and a right join argument which we follow until the origin of the
 * left and the right comparison argument is unified in a single column.
 *    At this place we have found the first common scope of both comparison
 * sides. We now only have to transform the first equi-join operator on our
 * way back to the root of the traversal (the select operator) into a
 * theta-join operator.
 *    In a following rewrite phase the new theta-join operator will
 * eventually become a real theta-join that has to evaluate a conjunctive
 * join.
 *
 * Copyright Notice:
 * -----------------
 *
 * The contents of this file are subject to the Pathfinder Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License.  You may obtain a copy of the License at
 * http://monetdb.cwi.nl/Legal/PathfinderLicense-1.1.html
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is the Pathfinder system.
 *
 * The Original Code has initially been developed by the Database &
 * Information Systems Group at the University of Konstanz, Germany and
 * is now maintained by the Database Systems Group at the Technische
 * Universitaet Muenchen, Germany.  Portions created by the University of
 * Konstanz and the Technische Universitaet Muenchen are Copyright (C)
 * 2000-2005 University of Konstanz and (C) 2005-2007 Technische
 * Universitaet Muenchen, respectively.  All Rights Reserved.
 *
 * $Id: intro_thetajoin.c,v 1.1 2007/05/07 10:17:08 tsheyar Exp $
 */

/* always include pathfinder.h first! */
#include "pathfinder.h"
#include <assert.h>
#include <stdio.h>

#include "la_thetajoin.h"
#include "properties.h"
#include "mem.h"          /* PFmalloc() */
#include "oops.h"
#include "alg_dag.h"

/*
 * Easily access subtree-parts.
 */
/** starting from p, make a step left */
#define L(p) ((p)->child[0])
/** starting from p, make a step right */
#define R(p) ((p)->child[1])

#define SEEN(n)       (n)->bit_dag
#define EDGE(n)       (n)->state_label

#define BOOL_COLS(n)  (n)->prop->icols
#define LEFT_COLS(n)  (n)->prop->l_icols
#define RIGHT_COLS(n) (n)->prop->r_icols

/**
 * Returns the intersection of an column list @a cols and a schema
 * @a schema_ocols
 */
static PFalg_att_t
intersect_ocol (PFalg_att_t cols, PFalg_schema_t schema_ocols)
{
    PFalg_att_t ocols = 0;

    /* intersect attributes */
    for (unsigned int j = 0; j < schema_ocols.count; j++)
        ocols |= schema_ocols.items[j].name;

    return cols & ocols;
}

/**
 * Returns union of two column lists
 */
static PFalg_att_t
union_ (PFalg_att_t a, PFalg_att_t b)
{
    return a | b;
}

/**
 * Returns the difference of two column lists
 */
static PFalg_att_t
diff (PFalg_att_t a, PFalg_att_t b)
{
    return a & (~b);
}

/**
 * Worker function for find_join (). It traverses
 * the DAG starting from a select operator and infers
 * three different informations:
 *  * bool_cols: The boolean column name(s) consumed
 *    by the select operator. If a boolean marked column
 *    hits a comparison operator new left and right
 *    columns are introduced.
 *  * left_cols: Column (names) that are necessary to
 *    provide the left input of a comparison.
 *  * right_cols: Column (names) that are necessary to
 *    provide the right input of a comparison.
 *
 * This worker ends if the left and the right column lists
 * overlap. In this case we have found something similar
 * to the lowest common scope (where a split appeared).
 */
static bool
find_join_worker (PFla_op_t *n,
                  PFla_op_t **join,
                  PFalg_att_t bool_cols,
                  PFalg_att_t left_cols,
                  PFalg_att_t right_cols)
{
    assert (n);
    
    /* collect all input columns */
    BOOL_COLS(n)  = union_ (BOOL_COLS(n), bool_cols);
    LEFT_COLS(n)  = union_ (LEFT_COLS(n), left_cols);
    RIGHT_COLS(n) = union_ (RIGHT_COLS(n), right_cols);
    
    EDGE(n)++;
    /* nothing to do if we haven't collected
       all incoming column lists of that node */
    if (EDGE(n) < PFprop_refctr (n))
        return false;

    /* remove all unnecessary columns from our structure */
    BOOL_COLS(n)  = intersect_ocol (BOOL_COLS(n), n->schema);
    LEFT_COLS(n)  = intersect_ocol (LEFT_COLS(n), n->schema);
    RIGHT_COLS(n) = intersect_ocol (RIGHT_COLS(n), n->schema);

    /* If both the left side and the right side of a comparison
       reference the same column in a number operator 
       we have (hopefully) reached our goal. */
    if (n->kind == la_number &&
        LEFT_COLS(n) && RIGHT_COLS(n) &&
        ((LEFT_COLS(n) & RIGHT_COLS(n)) == LEFT_COLS(n) ||
         (LEFT_COLS(n) & RIGHT_COLS(n)) == RIGHT_COLS(n)))
        return true;
    
    switch (n->kind) {
        case la_serialize:
            assert (!"this operator should never occur");
            break;

        case la_lit_tbl:
        case la_empty_tbl:
            break;

        case la_attach:
            BOOL_COLS(n)  = diff (BOOL_COLS(n),  n->sem.attach.attname);
            LEFT_COLS(n)  = diff (LEFT_COLS(n),  n->sem.attach.attname);
            RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.attach.attname);
            break;

        case la_cross:
        case la_semijoin:
        case la_select:
        case la_disjunion:
        case la_intersect:
        case la_difference:
        case la_distinct:
            break;

        case la_eqjoin:
        case la_thetajoin:
            /* make sure to remember the last join node where we split
               up the left and the right cols */
            if ((intersect_ocol (LEFT_COLS(n), L(n)->schema) ^
                 intersect_ocol (LEFT_COLS(n), R(n)->schema))
                &&
                (intersect_ocol (RIGHT_COLS(n), L(n)->schema) ^
                 intersect_ocol (RIGHT_COLS(n), R(n)->schema))
                &&
                ((intersect_ocol (LEFT_COLS(n), L(n)->schema) &&
                  intersect_ocol (RIGHT_COLS(n), R(n)->schema))
                 ^
                 (intersect_ocol (RIGHT_COLS(n), L(n)->schema) &&
                  intersect_ocol (LEFT_COLS(n), R(n)->schema))))
                *join = n;
            break;

        case la_project:
        {
            PFalg_att_t bool_cols  = 0,
                        left_cols  = 0,
                        right_cols = 0;
            
            /* rename columns from new to old */
            for (unsigned int i = 0; i < n->sem.proj.count; i++) {
                if (BOOL_COLS(n) & n->sem.proj.items[i].new)
                    bool_cols = union_ (bool_cols, n->sem.proj.items[i].old);
                if (LEFT_COLS(n) & n->sem.proj.items[i].new)
                    left_cols = union_ (left_cols, n->sem.proj.items[i].old);
                if (RIGHT_COLS(n) & n->sem.proj.items[i].new)
                    right_cols = union_ (right_cols, n->sem.proj.items[i].old);
            }
            BOOL_COLS(n)  = bool_cols;
            LEFT_COLS(n)  = left_cols;
            RIGHT_COLS(n) = right_cols;
        }   break;

        case la_fun_1to1:
            BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.fun_1to1.res);
            /* ignore fn:contains() here as we do not know
               how to implement it in a theta-join */
            
            if (LEFT_COLS(n) & n->sem.fun_1to1.res) {
                LEFT_COLS(n) = diff (LEFT_COLS(n), n->sem.fun_1to1.res);
                for (unsigned int i = 0; i < n->sem.fun_1to1.refs.count; i++)
                    LEFT_COLS(n) = union_ (LEFT_COLS(n), 
                                           n->sem.fun_1to1.refs.atts[i]);
            }

            if (RIGHT_COLS(n) & n->sem.fun_1to1.res) {
                RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.fun_1to1.res);
                for (unsigned int i = 0; i < n->sem.fun_1to1.refs.count; i++)
                    RIGHT_COLS(n) = union_ (RIGHT_COLS(n), 
                                            n->sem.fun_1to1.refs.atts[i]);
            }
            break;

        case la_num_eq:
        case la_num_gt:
            if (BOOL_COLS(n) & n->sem.binary.res) {
                BOOL_COLS(n)  = diff   (BOOL_COLS(n),  n->sem.binary.res);
                LEFT_COLS(n)  = union_ (LEFT_COLS(n),  n->sem.binary.att1);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
            }
            
            if (LEFT_COLS(n) & n->sem.binary.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.binary.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
            }

            if (RIGHT_COLS(n) & n->sem.binary.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.binary.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
            }
            break;

        case la_bool_and:
            if (BOOL_COLS(n) & n->sem.binary.res) {
                BOOL_COLS(n) = diff   (BOOL_COLS(n), n->sem.binary.res);
                BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.binary.att1);
                BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.binary.att2);
            }
            
            if (LEFT_COLS(n) & n->sem.binary.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.binary.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
            }

            if (RIGHT_COLS(n) & n->sem.binary.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.binary.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
            }
            break;
            
        case la_bool_or:
            BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.binary.res);
            
            if (LEFT_COLS(n) & n->sem.binary.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.binary.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att1);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.binary.att2);
            }

            if (RIGHT_COLS(n) & n->sem.binary.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.binary.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att1);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.binary.att2);
            }
            break;

        case la_bool_not:
            if (BOOL_COLS(n) & n->sem.unary.res) {
                BOOL_COLS(n) = diff   (BOOL_COLS(n), n->sem.unary.res);
                BOOL_COLS(n) = union_ (BOOL_COLS(n), n->sem.unary.att);
            }
            
            if (LEFT_COLS(n) & n->sem.unary.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.unary.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.unary.att);
            }

            if (RIGHT_COLS(n) & n->sem.unary.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.unary.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.unary.att);
            }
            break;

        case la_avg:
        case la_max:
        case la_min:
        case la_sum:
        case la_seqty1:
        case la_all:
            BOOL_COLS(n)  = diff (BOOL_COLS(n),  n->sem.aggr.res);

            if (LEFT_COLS(n) & n->sem.aggr.res)
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.aggr.att);
            if (RIGHT_COLS(n) & n->sem.aggr.res)
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.aggr.att);
            LEFT_COLS(n)  = diff (LEFT_COLS(n),  n->sem.aggr.res);
            RIGHT_COLS(n) = diff (RIGHT_COLS(n), n->sem.aggr.res);

        case la_count:
            /* keep partition columns for LEFT_COLS and RIGHT_COLS */

            /* BOOL_COLS can never occur as partitioning as there
               are no boolean partition criteria */
            break;

        case la_rownum:
            if (LEFT_COLS(n) & n->sem.rownum.attname) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.rownum.attname);

                for (unsigned int i = 0;
                     i < PFord_count (n->sem.rownum.sortby);
                     i++)
                    LEFT_COLS(n) = union_ (LEFT_COLS(n),
                                           PFord_order_col_at (
                                               n->sem.rownum.sortby, i));

                /* only infer part if available */
                if (n->sem.rownum.part != att_NULL)
                    LEFT_COLS(n) = union_ (LEFT_COLS(n),
                                           n->sem.rownum.part);
            }

            if (RIGHT_COLS(n) & n->sem.rownum.attname) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.rownum.attname);
                
                for (unsigned int i = 0;
                     i < PFord_count (n->sem.rownum.sortby);
                     i++)
                    RIGHT_COLS(n) = union_ (RIGHT_COLS(n),
                                            PFord_order_col_at (
                                                n->sem.rownum.sortby, i));

                /* only infer part if available */
                if (n->sem.rownum.part != att_NULL)
                    RIGHT_COLS(n) = union_ (RIGHT_COLS(n),
                                            n->sem.rownum.part);
            }
            break;

        case la_number:
            if (LEFT_COLS(n) & n->sem.number.attname && n->sem.number.part)
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.number.part);
            if (RIGHT_COLS(n) & n->sem.number.attname && n->sem.number.part)
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.number.part);

            LEFT_COLS(n)  = diff   (LEFT_COLS(n), n->sem.number.attname);
            RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.number.attname);
            break;

        case la_type:
        case la_cast:
            BOOL_COLS(n) = diff (BOOL_COLS(n), n->sem.type.res);

            if (LEFT_COLS(n) & n->sem.type.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.type.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.type.att);
            }

            if (RIGHT_COLS(n) & n->sem.type.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.type.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.type.att);
            }
            break;

        case la_type_assert:
            break;

        case la_scjoin:
            if (LEFT_COLS(n) & n->sem.scjoin.item_res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.scjoin.item_res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.scjoin.item);
            }
            if (RIGHT_COLS(n) & n->sem.scjoin.item_res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.scjoin.item_res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.scjoin.item);
            }
            break;

        case la_dup_scjoin:
            if (LEFT_COLS(n) & n->sem.scjoin.item_res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.scjoin.item_res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.scjoin.item);
            }
            if (RIGHT_COLS(n) & n->sem.scjoin.item_res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.scjoin.item_res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.scjoin.item);
            }
            break;

        case la_doc_tbl:
            if (LEFT_COLS(n) & n->sem.doc_tbl.item_res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.doc_tbl.item_res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.doc_tbl.item);
            }
            if (RIGHT_COLS(n) & n->sem.doc_tbl.item_res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.doc_tbl.item_res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.doc_tbl.item);
            }
            break;

        case la_doc_access:
            if (LEFT_COLS(n) & n->sem.doc_access.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.doc_access.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.doc_access.att);
            }
            if (RIGHT_COLS(n) & n->sem.doc_access.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.doc_access.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.doc_access.att);
            }
            break;

        case la_element:
            /* FIXME: for now we assume that a theta-join 
               cannot be pushed through an element constructor */
            LEFT_COLS(n)  = att_NULL;
            RIGHT_COLS(n) = att_NULL;
            break;

        case la_element_tag:
            break;

        case la_attribute:
            if (LEFT_COLS(n) & n->sem.attr.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.attr.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.attr.qn);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.attr.val);
            }
            if (RIGHT_COLS(n) & n->sem.attr.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.attr.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.attr.qn);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.attr.val);
            }
            break;

        case la_textnode:
            if (LEFT_COLS(n) & n->sem.textnode.res) {
                LEFT_COLS(n) = diff   (LEFT_COLS(n), n->sem.textnode.res);
                LEFT_COLS(n) = union_ (LEFT_COLS(n), n->sem.textnode.item);
            }
            if (RIGHT_COLS(n) & n->sem.textnode.res) {
                RIGHT_COLS(n) = diff   (RIGHT_COLS(n), n->sem.textnode.res);
                RIGHT_COLS(n) = union_ (RIGHT_COLS(n), n->sem.textnode.item);
            }
            break;

        case la_docnode:
        case la_comment:
        case la_processi:
            assert (!"not implemented yet?");

        case la_merge_adjacent:
            /* FIXME: for now we assume that a theta-join 
               cannot be pushed through an element constructor */
            LEFT_COLS(n)  = att_NULL;
            RIGHT_COLS(n) = att_NULL;
            break;

        case la_roots:
            break;

        case la_frag_union:
        case la_fragment:
        case la_empty_frag:
            /* for the fragment information we do not need to introduce
               column names as the thetajoin can never be moved along 
               its edges. */
            LEFT_COLS(n)  = att_NULL;
            RIGHT_COLS(n) = att_NULL;
            break;

        case la_cond_err:
        case la_trace:
            /* propagate input columns to the left child ... */
            if (find_join_worker (
                    L(n),
                    join,
                    BOOL_COLS(n),
                    LEFT_COLS(n),
                    RIGHT_COLS(n)))
                return true;
            
            /* ... and propagate nothing to the right child */
            if (find_join_worker (
                    R(n),
                    join,
                    0, 0, 0))
                return true;

            return false;
            break;

        case la_nil:
            break;

        case la_trace_msg:
            /* this operator cannot be reached */
            break;

        case la_trace_map:
            /* this operator cannot be reached */
            break;
            
        case la_rec_fix:
            /* FIXME: for now we assume that a theta-join 
               cannot be pushed through a recursion operator */
            LEFT_COLS(n)  = att_NULL;
            RIGHT_COLS(n) = att_NULL;
            break;
            
        case la_rec_param:
        case la_rec_arg:
        case la_rec_base:
            break;
            
        case la_proxy:
        case la_proxy_base:
            break;

        case la_string_join:
            /* propagate updated item column to the left child ... */
            if (find_join_worker (
                    L(n),
                    join,
                    BOOL_COLS(n),
                    LEFT_COLS(n) ? n->sem.string_join.item : 0,
                    RIGHT_COLS(n) ? n->sem.string_join.item : 0))
                return true;
            
            /* ... and propagate the item separator to the right child */
            if (find_join_worker (
                    R(n),
                    join,
                    BOOL_COLS(n),
                    LEFT_COLS(n) ? n->sem.string_join.item_sep : 0,
                    RIGHT_COLS(n) ? n->sem.string_join.item_sep : 0))
                return true;

            return false;
            break;
            
        case la_eqjoin_unq:
            PFoops (OOPS_FATAL,
                    "clone column aware equi-join operator is "
                    "only allowed with unique names!");

        case la_cross_mvd:
            PFoops (OOPS_FATAL,
                    "clone column aware cross product operator is "
                    "only allowed inside mvd optimization!");
            
        case la_dummy:
            break;
    }

    /* infer properties for children */
    if (L(n) && find_join_worker (L(n),
                                  join,
                                  BOOL_COLS(n),
                                  LEFT_COLS(n),
                                  RIGHT_COLS(n)))
        return true;

    if (R(n) && find_join_worker (R(n),
                                  join,
                                  BOOL_COLS(n),
                                  LEFT_COLS(n),
                                  RIGHT_COLS(n)))
        return true;
        
    return false;
}

/**
 * Reset the property information.
 */
static void
reset_fun (PFla_op_t *n)
{
    EDGE(n) = 0;

    /* reset the referenced column information */
    BOOL_COLS(n)  = 0;
    LEFT_COLS(n)  = 0;
    RIGHT_COLS(n) = 0;
}

/**
 * For each select node we reset the property information
 * and let a worker function find a join operator that can
 * be rewritten.
 */
static PFla_op_t *
find_join (PFla_op_t *select)
{
    PFla_op_t   *join = NULL;

    assert (select && select->kind == la_select);

    /* collect number of incoming edges (parents) */
    PFprop_infer_refctr (select);

    /* reset the old property information */
    PFprop_reset (select, reset_fun);

    if (find_join_worker (L(select),
                          &join,
                          select->sem.select.att,
                          0,
                          0) &&
        join &&
        join->kind == la_eqjoin)
        return join;
    else
        return NULL;
}

/**
 * Rewrite an equi-join operator into a theta-join operator.
 */
static void
transform_join (PFla_op_t *join)
{
    PFalg_sel_t *pred = PFmalloc (sizeof (PFalg_sel_t));

    assert (join && join->kind == la_eqjoin);

    pred[0] = PFalg_sel (alg_comp_eq,
                         join->sem.eqjoin.att1,
                         join->sem.eqjoin.att2);

    *join = *PFla_thetajoin (L(join), R(join), 1, pred);
}

/**
 * Collect all select nodes in our logical plan. (Starting
 * from these nodes we then try to find places where we
 * can introduce theta-join operators that (after optimization)
 * might hopefully merge with the select nodes.)
 */
static void
collect_select_nodes (PFla_op_t *n, PFarray_t *select_nodes)
{
    assert (n);

    /* nothing to do if we already visited that node */
    if (SEEN(n))
        return;

    /* infer properties for children */
    for (unsigned int i = 0; i < PFLA_OP_MAXCHILD && n->child[i]; i++)
        collect_select_nodes (n->child[i], select_nodes);

    if (n->kind == la_select)
        *(PFla_op_t **) PFarray_add (select_nodes) = n;

    SEEN(n) = true;
}

/**
 * Introduce theta-join operators.
 */
PFla_op_t *
PFintro_thetajoins (PFla_op_t *root)
{
    PFarray_t *select_nodes = PFarray (sizeof (PFla_op_t *));
    PFla_op_t *select, *join;
    
    /* collect the select nodes */
    collect_select_nodes (root, select_nodes);
    PFla_dag_reset (root);

    for (unsigned int i = 0; i < PFarray_last (select_nodes); i++) {
        /* for each select node we try to find a corresponding join */
        select = *(PFla_op_t **) PFarray_at (select_nodes, i);
        
        assert (select->kind == la_select);

        /* start the property inference (and find 
           as a side effect a matching join operator) */
        if ((join = find_join (select)))
            /* transform the equi-join into a theta-join */
            transform_join (join);
    }

    return root;
}

/* vim:set shiftwidth=4 expandtab filetype=c: */

Index: map_ori_names.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/map_ori_names.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -d -r1.11 -r1.12
--- map_ori_names.c     15 Mar 2007 14:12:55 -0000      1.11
+++ map_ori_names.c     7 May 2007 10:17:12 -0000       1.12
@@ -329,6 +329,20 @@
                                                    p->sem.eqjoin.att2));
             break;
 
+        case la_thetajoin:
+        {
+            PFalg_sel_t *pred = PFmalloc (p->sem.thetajoin.count *
+                                          sizeof (PFalg_sel_t));
+
+            for (unsigned int i = 0; i < p->sem.thetajoin.count; i++)
+                pred[i] = PFalg_sel (p->sem.thetajoin.pred[i].comp,
+                                     ONAME (p, p->sem.thetajoin.pred[i].left),
+                                     ONAME (p, 
p->sem.thetajoin.pred[i].right));
+
+            res = thetajoin (PROJ(LEFT, p), PROJ(RIGHT, p),
+                             p->sem.thetajoin.count, pred);
+        }   break;
+
         case la_project:
         {
             PFla_op_t *left;

Index: map_unq_names.c
===================================================================
RCS file: /cvsroot/monetdb/pathfinder/compiler/algebra/map/map_unq_names.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -d -r1.12 -r1.13
--- map_unq_names.c     15 Mar 2007 14:12:55 -0000      1.12
+++ map_unq_names.c     7 May 2007 10:17:12 -0000       1.13
@@ -168,6 +168,7 @@
             break;
 
         case la_cross:
+        case la_thetajoin:
         {
             /* Add a projection for each operand to ensure
                that all columns are renamed. */
@@ -210,9 +211,24 @@
             left  = PFla_project_ (left, count1, projlist1);
             right = PFla_project_ (right, count2, projlist2);
                                 
-            res = cross (left, right);
+            if (p->kind == la_cross)
+                res = cross (left, right);
+            else { /* p->kind == la_thetajoin */
+                PFalg_sel_t *pred = PFmalloc (p->sem.thetajoin.count *
+                                              sizeof (PFalg_sel_t));
+
+                for (unsigned int i = 0; i < p->sem.thetajoin.count; i++)
+                    pred[i] = PFalg_sel (
+                                  p->sem.thetajoin.pred[i].comp,
+                                  UNAME (p, p->sem.thetajoin.pred[i].left),
+                                  UNAME (p, p->sem.thetajoin.pred[i].right));
+
+                res = thetajoin (left, right,
+                                 p->sem.thetajoin.count, pred);
+            }
         }   break;
 
+
         case la_cross_mvd:
             PFoops (OOPS_FATAL,
                     "clone column unaware cross product operator is "


-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
Monetdb-pf-checkins mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/monetdb-pf-checkins

Reply via email to