Changeset: 4edd9147773b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4edd9147773b
Modified Files:
monetdb5/modules/mal/groupby.c
monetdb5/modules/mal/groupby.h
monetdb5/modules/mal/groupby.mal
Branch: default
Log Message:
Second round of groupby API
RFC on this interface set for both the improved mergetable
and distributed processing.
diffs (176 lines):
diff --git a/monetdb5/modules/mal/groupby.c b/monetdb5/modules/mal/groupby.c
--- a/monetdb5/modules/mal/groupby.c
+++ b/monetdb5/modules/mal/groupby.c
@@ -19,17 +19,15 @@
/*
* (c) Martin Kersten
- * Group-by support
+ * Multicolumn group-by support
* The group-by support module is meant to replace and speedup the kernel
grouping routines.
* The latter was originally designed in a memory constraint setting and an
exercise in
* performing column-wise grouping incrementally. The effect is that these
routines are
* now a major performance hindrances.
*
- * This module again takes the columnar approach to grouping, but supports for
more
- * parallelism in achieving these goals.
- *
- * The target is to support SQL-like group_by operations, which are lists of
- * attributes (reduced by a pivot list) followed by a group aggregate function.
+ * The target is to support SQL-like multicolumn group_by operations, which are
lists of
+ * attributes and a group aggregate function.
+ * Each group can be represented with an oid into the n-ary table.
* Consider the query "select count(*), max(A) from R group by A, B,C." whose
code
* snippet in MAL would become something like:
* @verbatim
@@ -39,15 +37,18 @@
* ...
* _9 := algebra.select(_1,0,100);
* ..
- * grp:bat[:oid,:oid] := groupby.id(_9, _1, _2, _3);
- * grp_4:bat[:oid,:wrd] := groupby.count(_9, _1, _2, _3);
- * grp_5:bat[:oid,:lng] := groupby.max(_9,_2, _3, _1);
+ * (grp_4:bat[:oid,:wrd], gid:bat[:oid,:oid]) := groupby.count(_9, _1,_2,_3);
+ * (grp_5:bat[:oid,:lng], gid:bat[:oid,:oid]) := groupby.max(_9,_2, _1,_2,_3);
* @end verbatim
*
+ * All instructions have a candidate oid list.
* The id() function merely becomes the old-fashioned oid-based group
identification list.
* This way related values can be obtained from the attribute columns. It can
be the input
* for the count() function, which saves some re-computation.
*
+ * Aside from the group ids, we also provide options to return the value-based
aggregate table
+ * to ease development of parallel plans.
+ *
* The implementation is optimized for a limited number of groups. The default
is
* to fall back on the old code sequences.
*
@@ -163,6 +164,46 @@ GROUPid(Client cntxt, MalBlkPtr mb, MalS
}
str
+GROUPcountTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+ (void) cntxt;
+ (void) mb;
+ (void) stk;
+ (void) pci;
+ return MAL_SUCCEED;
+}
+
+str
+GROUPmaxTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+ (void) cntxt;
+ (void) mb;
+ (void) stk;
+ (void) pci;
+ return MAL_SUCCEED;
+}
+
+str
+GROUPminTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+ (void) cntxt;
+ (void) mb;
+ (void) stk;
+ (void) pci;
+ return MAL_SUCCEED;
+}
+
+str
+GROUPavgTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+ (void) cntxt;
+ (void) mb;
+ (void) stk;
+ (void) pci;
+ return MAL_SUCCEED;
+}
+
+str
GROUPcount(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
{
int *ret = (int*) getArgReference(stk,pci,0);
diff --git a/monetdb5/modules/mal/groupby.h b/monetdb5/modules/mal/groupby.h
--- a/monetdb5/modules/mal/groupby.h
+++ b/monetdb5/modules/mal/groupby.h
@@ -35,11 +35,16 @@
#define group_by_export extern
#endif
+group_by_export str GROUPmulticolumn(Client cntxt, MalBlkPtr mb, MalStkPtr
stk, InstrPtr pci);
+
group_by_export str GROUPid(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
group_by_export str GROUPcount(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
+group_by_export str GROUPcountTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
group_by_export str GROUPmax(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
+group_by_export str GROUPmaxTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
group_by_export str GROUPmin(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
+group_by_export str GROUPminTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
group_by_export str GROUPavg(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
-group_by_export str GROUPmulticolumn(Client cntxt, MalBlkPtr mb, MalStkPtr
stk, InstrPtr pci);
+group_by_export str GROUPavgTable(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
InstrPtr pci);
#endif /* _GROUPBY_H */
diff --git a/monetdb5/modules/mal/groupby.mal b/monetdb5/modules/mal/groupby.mal
--- a/monetdb5/modules/mal/groupby.mal
+++ b/monetdb5/modules/mal/groupby.mal
@@ -17,27 +17,46 @@
module group;
-pattern group.subgroup(b:bat[:oid,:any]...)(ref:bat[:oid,:oid],
grp:bat[:oid,:any],ext:bat[:oid,:any])
+pattern group.subgroup(b:bat[:oid,:any]...)(ref:bat[:oid,:oid],
grp:bat[:oid,:any], ext:bat[:oid,:any])
address GROUPmulticolumn
-comment "Old-fashioned derivation of a group index over multiple columns.";
+comment "Old-fashioned derivation of a group index over multiple columns. A
first oid-argument is interpreted as a candidate list";
+
+pattern group.id(cand:bat[:oid,:oid],b:bat[:oid,:any]...) :bat[:oid,:oid]
+address GROUPid
+comment "Derive a group id for all n-tuples. A first oid-argument is
interpreted as a candidate list or NULL";
pattern group.id(b:bat[:oid,:any]...) :bat[:oid,:oid]
address GROUPid
-comment "Derive the grouping of all tuples in the BATs.";
+comment "Derive a group id for all n-tuples.";
-pattern group.count(b:bat[:oid,:any]...) (o:bat[:oid,:oid],cnt:bat[:oid,:wrd])
+pattern group.count(cand:bat[:oid,:oid],b:bat[:oid,:any]...)
(cnt:bat[:oid,:wrd], b:bat[:oid,:any]...)
+address GROUPcountTable
+comment "Derive a group table for all n-tuples and their count. The first
argument is a candidate list";
+
+pattern group.count(cand:bat[:oid,:oid],b:bat[:oid,:any]...)
(cnt:bat[:oid,:wrd], gid:bat[:oid,:oid])
address GROUPcount
-comment "Derive the grouping of all tuples in the BATs and derive their count
them.";
+comment "Derive a group id for all n-tuples and their count. The first
argument is a candidate list";
-pattern group.max(target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(o:bat[:oid,:oid],:bat[:oid,:any_1])
+pattern
group.max(cand:bat[:oid,:oid],target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(mx:bat[:oid,:any_1], o:bat[:oid,:any]...)
+address GROUPmaxTable
+comment "Derive the group table of all n-tuples and determine their maximum
value. The first argument is a candidate list";
+
+pattern
group.max(cand:bat[:oid,:oid],target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(mx:bat[:oid,:any_1], gid:bat[:oid,:oid])
address GROUPmax
-comment "Derive the grouping of all tuples in the BATs and determine their
maximum value.";
+comment "Derive the group id of all n-tuples and determine their maximum
value. The first argument is a candidate list";
-pattern group.min(target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(o:bat[:oid,:oid],mi:bat[:oid,:any_1])
+pattern
group.min(cand:bat[:oid,:oid],target:bat[:oid,:any_1],b:bat[:oid,:any]...)
(mi:bat[:oid,:any_1], o:bat[:oid,:any]...)
+address GROUPminTable
+comment "Derive the group id of all n-tuples and determine their minimum value.
The first argument is a candidate list";
+
+pattern group.min(cand:bat[:oid,:oid],
target:bat[:oid,:any_1],b:bat[:oid,:any]...) (mi:bat[:oid,:any_1],
gid:bat[:oid,:oid])
address GROUPmin
-comment "Derive the grouping of all tuples in the BATs and their determine
minimum value.";
+comment "Derive the group id of all n-tuples and determine their minimum value.
The first argument is a candidate list";
-pattern group.avg(pivot:bat[:oid,:any_1],b:bat[:oid,:any_2]...)
(o:bat[:oid,:oid],a:bat[:oid,:any_2])
+pattern group.avg(cand:bat[:oid,:oid],
target:bat[:oid,:any_1],b:bat[:oid,:any_2]...) (a:bat[:oid,:dbl])
address GROUPavg
-comment "Derive the grouping of all tuples in the BATs and determine their
average value.";
+comment "Derive the group id of all n-tuples and determine their average
value. The first argument is a candidate list";
+pattern group.avg(cand:bat[:oid,:oid],
target:bat[:oid,:any_1],b:bat[:oid,:any_2]...) (a:bat[:oid,:dbl],
o:bat[:oid,:any]...)
+address GROUPavgTable
+comment "Derive the group id of all n-tuples and determine their average
value. The first argument is a candidate list";
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list