Changeset: b071fd025cf9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b071fd025cf9
Modified Files:
        MonetDB5/conf/monetdb5.conf.in
        MonetDB5/src/optimizer/opt_derivepath.mx
Branch: default
Log Message:

Framework for derive path runtime optimization.
The sequence of derives can be changed by looking at the run time
properties, e.g. sortedness or histogram samples.


diffs (280 lines):

diff -r c725ce4f2e0c -r b071fd025cf9 MonetDB5/conf/monetdb5.conf.in
--- a/MonetDB5/conf/monetdb5.conf.in    Wed May 12 18:51:00 2010 +0200
+++ b/MonetDB5/conf/monetdb5.conf.in    Thu May 13 11:16:03 2010 +0200
@@ -231,7 +231,7 @@
 
datacyclotron_pipe=inline,remap,evaluate,costModel,coercions,emptySet,aliases,datacyclotron,mergetable,deadcode,constants,commonTerms,joinPath,reorder,deadcode,reduce,dataflow,history,replication,multiplex,garbageCollector
 
 # The default + derivePath
-derive_pipe=inline,remap,evaluate,costModel,coercions,emptySet,aliases,mergetable,deadcode,constants,commonTerms,derivePath,joinPath,deadcode,reduce,dataflow,history,multiplex,garbageCollector
+derive_pipe=inline,remap,evaluate,costModel,coercions,emptySet,aliases,mitosis,mergetable,deadcode,commonTerms,derivePath,joinPath,reorder,deadcode,reduce,dataflow,history,multiplex,garbageCollector
 
 # The default + dictionary
 
dictionary_pipe=inline,remap,dictionary,evaluate,costModel,coercions,emptySet,aliases,mergetable,deadcode,constants,commonTerms,joinPath,deadcode,reduce,dataflow,history,multiplex,garbageCollector
diff -r c725ce4f2e0c -r b071fd025cf9 MonetDB5/src/optimizer/opt_derivepath.mx
--- a/MonetDB5/src/optimizer/opt_derivepath.mx  Wed May 12 18:51:00 2010 +0200
+++ b/MonetDB5/src/optimizer/opt_derivepath.mx  Thu May 13 11:16:03 2010 +0200
@@ -24,57 +24,47 @@
 @verbatim
        (ext1,grp1) := group.new(b);
        (ext2,grp2) := group.derive(ext1,grp1, c);
-       (ext3,grp3) := group.derive(ext2,grp2, d);
+       (ext3,grp3) := group.done(ext2,grp2, d);
 @end verbatim
 The result becomes.
 @verbatim
        (ext3,grp3) := group.derivepath(b,c,d);
 @end verbatim
 
-Another example is where we are looking for distinct values.
-Then we don't need the extend and can fall back to a simple histogram
-trimming the count part. In the fragment below, the ext1 lifetime
-ends at the groups.new function.
-...@verbatim
-       (ext1,grp1) := group.new(b);
-       d:= bat.mirror(ext1);
-       ext1 := nil;
-...@end verbatim
+The implementation of this operator can freely re-order the BATs 
+for reduced intermediate results or as a basis for parallel scanning
+all BATs involved to derive their group id.
 
-...@{
-Experiments on the airtraffic database gave the following results.
-...@verbatim
-SELECT DISTINCT("AirlineID") FROM ontime 
-Looking at cold/hot run "AirlineID" decimal(8,2) DEFAULT NULL,
-cold Timer   15560.180 msec 
-hot Timer    6284.703 msec 
-After addition of the derivePath optimizer to turn it into kunique.
-cold Timer   15328.508 msec
-hot Timer    3713.352 msec
-
-a cold/hot "Year" smallint DEFAULT NULL,
-cold Timer    5517.951 msec 
-hot Timer    1288.417 msec 
-With deriverPath
-cold Timer    9961.625 msec
-hot Timer    3153.128 msec
-turning off derivePath optimizer again
-hot Timer    1318.827 msec
-
-conclusion (after a few more checks)
-perhaps kunique needs some techniques from group.new. 
-...@end verbatim
-SELECT DISTINCT("year") FROM ontime 
+The collection can be extended to immediately aim for group counting
+or summation, avoiding the materialisation of the group id table.
 @mal
 pattern optimizer.derivePath():str
 address OPTderivePath;
 pattern optimizer.derivePath(mod:str, fcn:str):str
 address OPTderivePath
 comment "Join path constructor";
-pattern group.derivePath(l:bat[:oid,:any]...)(:bat[:oid,:any],:bat[:oid,:any])
+
+pattern 
group.derivePath(l:bat[:oid,:any]...)(grp:bat[:oid,:any],ext:bat[:oid,:any])
 address ALGderivePath
-comment "internal routine to handle derivation paths.
-       The type analysis is rather tricky.";
+comment "Derive a group index.";
+
+pattern group.deriveCount(l:bat[:oid,:any]...):bat[:oid,:wrd]
+address ALGderiveCount
+comment "Derive a group count.";
+
+pattern group.deriveSum(s:bat[:oid,:int],l:bat[:oid,:any]...):bat[:oid,:int]
+address ALGderiveSum
+comment "Derive a group sum.";
+pattern group.deriveSum(s:bat[:oid,:lng],l:bat[:oid,:any]...):bat[:oid,:lng]
+address ALGderiveSum
+comment "Derive a group sum.";
+pattern group.deriveSum(s:bat[:oid,:flt],l:bat[:oid,:any]...):bat[:oid,:flt]
+address ALGderiveSum
+comment "Derive a group sum.";
+pattern group.deriveSum(s:bat[:oid,:dbl],l:bat[:oid,:any]...):bat[:oid,:dbl]
+address ALGderiveSum
+comment "Derive a group sum.";
+...@{
 @h
 #ifndef _OPT_DERIVEPATH_
 #define _OPT_DERIVEPATH_
@@ -82,17 +72,20 @@
 #include "opt_support.h"
 #include "mal_interpreter.h"
 
-/* #define DEBUG_OPT_DERIVEPATH  */
+#define DEBUG_OPT_DERIVEPATH  /**/
+opt_export str ALGderivePath(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr pci);
+opt_export str ALGderiveCount(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr pci);
+opt_export str ALGderiveSum(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr pci);
 @c
 #include "mal_config.h"
 #include "opt_derivepath.h"
+#include "group.h"
 
 static int
 OPTderivePathImplementation(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 
InstrPtr p)
 {
-       int i, j, actions=0;
+       int i, actions=0;
        int *pc;
-       str derivePathRef = putName("derivePath",10);
        InstrPtr q;
        InstrPtr *old;
        int limit,slimit;
@@ -121,57 +114,33 @@
 
        for (i = 0; i<limit; i++){
                p= old[i];
-               if (getModuleId(p) == groupRef && p->argc == 3 &&
-                  (getFunctionId(p) == newRef || getFunctionId(p) == doneRef)){
-                       if ( getEndLifespan(span,getArg(p,1)) == i  ){
-                               /* we are only interested in the distinct 
values */
-                               /* trafo:  (ext,grp):= group.new(B);
-                                  becomes: br:= bat.reverse(B);
-                                                       kb:= 
algebra.kunique(br);
-                                                       ext:= bat.reverse(kb);
-                               */
-                               q= newStmt(mb,batRef, reverseRef);
-                               q= pushArgument(mb,q,getArg(p,2));
-                               delArgument(p,1);
-                               clrFunction(p);
-                               setModuleId(p,algebraRef);
-                               setFunctionId(p,kuniqueRef);
-                               getArg(p,1) = getArg(q,0);
-                               j = getArg(p,0);
-                               pushInstruction(mb,p);
-                               q= newStmt(mb,batRef, reverseRef);
-                               pushArgument(mb,q,getArg(q,0));
-                               getArg(p,0) = getArg(q,0);
-                               getArg(q,0) = j;
-                               clrVarUDFtype(mb,j);
-                               clrVarFixed(mb,j);
-                               setVarType(mb,j,TYPE_any);
-                               actions++;
-                               continue;
-                       } else {
+               if (getModuleId(p) == groupRef && p->argc == 3 && 
getFunctionId(p) == newRef ){
+                               setFunctionId(p,derivePathRef);
                                pc[getArg(p,0)] = i;
                                pc[getArg(p,1)] = i;
-                       }
-               }
-               if (getModuleId(p) == groupRef && p->argc == 5 &&
-                  (getFunctionId(p) == deriveRef || getFunctionId(p) == 
doneRef)){
-...@-
-Try to expand its argument list with what we have found so far.
-This creates a series of join paths, many of which will be removed during 
deadcode elimination.
-...@c
-                       if (pc[getArg(p,2)]== pc[getArg(p,3)]){
-                               q= 
copyInstruction(getInstrPtr(mb,pc[getArg(p,2)]));
-                               q= pushArgument(mb,q, getArg(p,4));
-                               getArg(q,0)= getArg(p,0);
-                               getArg(q,1)= getArg(p,1);
-                               pc[getArg(p,0)] = i;
-                               pc[getArg(p,1)] = i;
-                               setFunctionId(q,derivePathRef);
-                               p= q;
                                actions++;
 #ifdef DEBUG_OPT_DERIVEPATH 
                                stream_printf(cntxt->fdout,"new derivePath 
instruction\n");
-                               printInstruction(cntxt->fdout,mb, 0, q, 
LIST_MAL_ALL);
+                               printInstruction(cntxt->fdout,mb, 0, p, 
LIST_MAL_ALL);
+#endif
+               }
+               if (getModuleId(p) == groupRef && p->argc == 5 && 
(getFunctionId(p) == deriveRef || getFunctionId(p) == doneRef)){
+...@-
+Try to expand its argument list with what we have found so far.
+This creates a series of derive paths, many of which will be removed during 
deadcode elimination.
+...@c
+                       if (pc[getArg(p,2)] && pc[getArg(p,2)]== 
pc[getArg(p,3)]){
+                               q= 
copyInstruction(getInstrPtr(mb,pc[getArg(p,2)]));
+                               q= pushArgument(mb,q, getArg(p,4));
+                               getArg(q,0) = getArg(p,0);
+                               getArg(q,1) = getArg(p,1);
+                               pc[getArg(q,0)] = i;
+                               pc[getArg(q,1)] = i;
+                               freeInstruction(p);
+                               p= q;
+#ifdef DEBUG_OPT_DERIVEPATH 
+                               stream_printf(cntxt->fdout,"new derivePath 
instruction extension\n");
+                               printInstruction(cntxt->fdout,mb, 0, p, 
LIST_MAL_ALL);
 #endif
                        }
                } 
@@ -196,21 +165,68 @@
 #include "opt_statistics.h"
 @:wrapOptimizer(derivePath,OPT_CHECK_ALL)@
 @-
-The derive path optimizer takes a derivation sequence and
-attempts to minimize the intermediate result.
-The choice depends on a good estimate of intermediate
-results using properties.
-For the time being, we use a simplistic model, based
-on the assumption that most joins are foreign key joins anyway.
+The derive path optimizer takes a derivation sequence and attempts to minimize 
the intermediate result.
+The choice depends on a good estimate of intermediate results using properties.
+For the time being, we use a simplistic model, based on the assumption that 
most joins are foreign key joins anyway.
 @c
+typedef struct{
+       int *arg;
+       BAT *b;
+} Elm;
 str
 ALGderivePath(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
+       int *grp = (int*) getArgReference(stk,pci,0);
+       int *ext = (int*) getArgReference(stk,pci,1);
+       Elm *path = (Elm *) GDKzalloc(pci->argc * sizeof(Elm));
+       int i, newgrp, newext;
+       str msg = MAL_SUCCEED;
+
+       if ( path == NULL)
+               throw(MAL,"group.derivePath",MAL_MALLOC_FAIL);
+       /* collect the properties (histograms) for all columns */
+       for ( i = 2; i < pci->argc; i++){
+               path[i].arg = (int*) getArgReference(stk,pci,i);
+       }
+       /* determine an optimal order for derivation by reshuffling */
+
+       msg = GRPgroup(grp, ext, path[2].arg);
+       for ( i=3; msg == MAL_SUCCEED && i < pci->argc; i++){
+               msg = GRPderive(&newgrp, &newext, grp, ext, path[i].arg);
+               if ( msg == MAL_SUCCEED){
+                       BBPreleaseref(*grp);
+                       BBPreleaseref(*ext);
+                       *grp = newgrp;
+                       *ext = newext;
+               }
+       }
+       /* remove the histograms */
+       for ( i = 2; i < pci->argc; i++){
+       }
+       GDKfree(path);
+       (void) cntxt;
+       (void) mb;
+       return msg;
+}
+
+str
+ALGderiveCount(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
        (void) cntxt;
        (void) mb;
        (void) stk;
        (void) pci;
-       throw(MAL,"group.derivePath",PROGRAM_NYI);
+       throw(MAL,"group.deriveCount",PROGRAM_NYI);
+}
+
+str
+ALGderiveSum(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+       (void) cntxt;
+       (void) mb;
+       (void) stk;
+       (void) pci;
+       throw(MAL,"group.deriveSum",PROGRAM_NYI);
 }
 
 @}
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to