Changeset: 498738535dfe for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=498738535dfe
Modified Files:
clients/Tests/exports.stable.out
gdk/gdk_aggr.c
gdk/gdk_calc.h
monetdb5/modules/kernel/aggr.c
monetdb5/modules/kernel/aggr.mal
monetdb5/modules/kernel/aggr.mal.sh
monetdb5/modules/kernel/algebra.mx
sql/backends/monet5/sql.mx
sql/backends/monet5/sql_scenario.c
sql/scripts/39_analytics.sql
Branch: default
Log Message:
Implemented {stddev,var}_{samp,pop} SQL aggregates using a single scan.
The SQL aggregate stddev is gone and has been replaced with the
standard-compliant stddev_samp. In addition we now also have
stddev_pop, and the variance aggregates var_samp and var_pop. All
four aggregates return a DOUBLE value.
diffs (truncated from 1018 to 300 lines):
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -94,6 +94,8 @@ dbl BATcalcstdev_population(dbl *avgp, B
dbl BATcalcstdev_sample(dbl *avgp, BAT *b);
BAT *BATcalcsub(BAT *b1, BAT *b2, BAT *s, int tp, int abort_on_error);
BAT *BATcalcsubcst(BAT *b, const ValRecord *v, BAT *s, int tp, int
abort_on_error);
+dbl BATcalcvariance_population(dbl *avgp, BAT *b);
+dbl BATcalcvariance_sample(dbl *avgp, BAT *b);
BAT *BATcalcxor(BAT *b1, BAT *b2, BAT *s);
BAT *BATcalcxorcst(BAT *b, const ValRecord *v, BAT *s);
BAT *BATclear(BAT *b, int force);
@@ -129,6 +131,8 @@ BAT *BATgroupsize(BAT *b, BAT *g, BAT *e
BAT *BATgroupstdev_population(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int
skip_nils, int abort_on_error);
BAT *BATgroupstdev_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int
skip_nils, int abort_on_error);
BAT *BATgroupsum(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int skip_nils, int
abort_on_error);
+BAT *BATgroupvariance_population(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int
skip_nils, int abort_on_error);
+BAT *BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp, int
skip_nils, int abort_on_error);
BUN BATgrows(BAT *b);
BAT *BAThash(BAT *b, BUN masksize);
BAT *BAThashjoin(BAT *l, BAT *r, BUN estimate);
@@ -675,6 +679,10 @@ str AGGRsubsumcand_int(bat *retval, bat
str AGGRsubsumcand_lng(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid,
int *skip_nils, int *abort_on_error);
str AGGRsubsumcand_sht(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid,
int *skip_nils, int *abort_on_error);
str AGGRsubsumcand_wrd(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid,
int *skip_nils, int *abort_on_error);
+str AGGRsubvariance_dbl(bat *retval, bat *bid, bat *gid, bat *eid, int
*skip_nils, int *abort_on_error);
+str AGGRsubvariancecand_dbl(bat *retval, bat *bid, bat *gid, bat *eid, bat
*sid, int *skip_nils, int *abort_on_error);
+str AGGRsubvariancep_dbl(bat *retval, bat *bid, bat *gid, bat *eid, int
*skip_nils, int *abort_on_error);
+str AGGRsubvariancepcand_dbl(bat *retval, bat *bid, bat *gid, bat *eid, bat
*sid, int *skip_nils, int *abort_on_error);
str AGGRsubxml(bat *retval, bat *bid, bat *gid, bat *eid, int *skip_nils);
str AGGRsubxmlcand(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid, int
*skip_nils);
str AGGRsum2_bte(bat *retval, bat *bid, bat *eid);
@@ -691,6 +699,10 @@ str AGGRsum3_int(bat *retval, bat *bid,
str AGGRsum3_lng(bat *retval, bat *bid, bat *gid, bat *eid);
str AGGRsum3_sht(bat *retval, bat *bid, bat *gid, bat *eid);
str AGGRsum3_wrd(bat *retval, bat *bid, bat *gid, bat *eid);
+str AGGRvariance2_dbl(bat *retval, bat *bid, bat *eid);
+str AGGRvariance3_dbl(bat *retval, bat *bid, bat *gid, bat *eid);
+str AGGRvariancep2_dbl(bat *retval, bat *bid, bat *eid);
+str AGGRvariancep3_dbl(bat *retval, bat *bid, bat *gid, bat *eid);
str AGGRxml(bat *retval, bat *bid, int *skip_nils);
str ALARMctime(str *res);
str ALARMepilogue(void);
@@ -844,6 +856,8 @@ str ALGtunique(int *result, int *bid);
str ALGuselect(int *result, int *bid, ptr low, ptr high);
str ALGuselect1(int *result, int *bid, ptr value);
str ALGuselectInclusive(int *result, int *bid, ptr low, ptr high, bit *lin,
bit *rin);
+str ALGvariance(dbl *res, int *bid);
+str ALGvariancep(dbl *res, int *bid);
str ARRAYgridBAT_int(int *ret, int *bid, int *groups, int *groupsize, int
*clustersize, int *offset);
str ARRAYgridBAT_lng(lng *ret, lng *bid, lng *groups, lng *groupsize, lng
*clustersize, lng *offset);
str ARRAYgridBATshift_int(int *ret, int *bid, int *groups, int *groupsize, int
*clustersize, int *offset, int *shift);
diff --git a/gdk/gdk_aggr.c b/gdk/gdk_aggr.c
--- a/gdk/gdk_aggr.c
+++ b/gdk/gdk_aggr.c
@@ -2133,7 +2133,7 @@ BATgroupmedian(BAT *b, BAT *g, BAT *e, B
} while (0)
static dbl
-calcstdev(dbl *avgp, const void *values, BUN cnt, int tp, int issample)
+calcvariance(dbl *avgp, const void *values, BUN cnt, int tp, int issample)
{
BUN n = 0, i;
dbl mean = 0;
@@ -2171,21 +2171,37 @@ calcstdev(dbl *avgp, const void *values,
}
if (avgp)
*avgp = mean;
- return sqrt(m2 / (n - issample));
+ return m2 / (n - issample);
}
dbl
BATcalcstdev_population(dbl *avgp, BAT *b)
{
- return calcstdev(avgp, (const void *) Tloc(b, BUNfirst(b)),
- BATcount(b), b->ttype, 0);
+ dbl v = calcvariance(avgp, (const void *) Tloc(b, BUNfirst(b)),
+ BATcount(b), b->ttype, 0);
+ return v == dbl_nil ? dbl_nil : sqrt(v);
}
dbl
BATcalcstdev_sample(dbl *avgp, BAT *b)
{
- return calcstdev(avgp, (const void *) Tloc(b, BUNfirst(b)),
- BATcount(b), b->ttype, 1);
+ dbl v = calcvariance(avgp, (const void *) Tloc(b, BUNfirst(b)),
+ BATcount(b), b->ttype, 1);
+ return v == dbl_nil ? dbl_nil : sqrt(v);
+}
+
+dbl
+BATcalcvariance_population(dbl *avgp, BAT *b)
+{
+ return calcvariance(avgp, (const void *) Tloc(b, BUNfirst(b)),
+ BATcount(b), b->ttype, 0);
+}
+
+dbl
+BATcalcvariance_sample(dbl *avgp, BAT *b)
+{
+ return calcvariance(avgp, (const void *) Tloc(b, BUNfirst(b)),
+ BATcount(b), b->ttype, 1);
}
#define AGGR_STDEV(TYPE) \
@@ -2224,9 +2240,12 @@ BATcalcstdev_sample(dbl *avgp, BAT *b)
mean[i] = dbl_nil; \
nils++; \
} else if (cnts[i] == 1) { \
- dbls[i] = 0; \
+ dbls[i] = issample ? dbl_nil : 0; \
+ nils2++; \
+ } else if (variance) { \
+ dbls[i] = m2[i] / (cnts[i] - issample); \
} else { \
- dbls[i] = m2[i] / (cnts[i] - issample); \
+ dbls[i] = sqrt(m2[i] / (cnts[i] - issample)); \
} \
} \
} while (0)
@@ -2241,13 +2260,13 @@ BATcalcstdev_sample(dbl *avgp, BAT *b)
* aggregates. */
static BAT *
dogroupstdev(BAT **avgb, BAT *b, BAT *g, BAT *e, BAT *s, int tp,
- int skip_nils, int issample, const char *func)
+ int skip_nils, int issample, int variance, const char *func)
{
const oid *gids;
oid gid;
oid min, max;
BUN i, ngrp;
- BUN nils = 0;
+ BUN nils = 0, nils2 = 0;
BUN *cnts = NULL;
dbl *dbls, *mean, *delta, *m2;
BAT *bn = NULL;
@@ -2362,6 +2381,7 @@ dogroupstdev(BAT **avgb, BAT *b, BAT *g,
} else {
GDKfree(mean);
}
+ nils += nils2;
GDKfree(delta);
GDKfree(m2);
GDKfree(cnts);
@@ -2392,7 +2412,7 @@ BATgroupstdev_sample(BAT *b, BAT *g, BAT
int skip_nils, int abort_on_error)
{
(void) abort_on_error;
- return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 1,
+ return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 1, 0,
"BATgroupstdev_sample");
}
@@ -2401,6 +2421,24 @@ BATgroupstdev_population(BAT *b, BAT *g,
int skip_nils, int abort_on_error)
{
(void) abort_on_error;
- return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 0,
+ return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 0, 0,
"BATgroupstdev_population");
}
+
+BAT *
+BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp,
+ int skip_nils, int abort_on_error)
+{
+ (void) abort_on_error;
+ return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 1, 1,
+ "BATgroupvariance_sample");
+}
+
+BAT *
+BATgroupvariance_population(BAT *b, BAT *g, BAT *e, BAT *s, int tp,
+ int skip_nils, int abort_on_error)
+{
+ (void) abort_on_error;
+ return dogroupstdev(NULL, b, g, e, s, tp, skip_nils, 0, 1,
+ "BATgroupvariance_population");
+}
diff --git a/gdk/gdk_calc.h b/gdk/gdk_calc.h
--- a/gdk/gdk_calc.h
+++ b/gdk/gdk_calc.h
@@ -140,3 +140,7 @@ gdk_export dbl BATcalcstdev_population(d
gdk_export dbl BATcalcstdev_sample(dbl *avgp, BAT *b);
gdk_export BAT *BATgroupstdev_sample(BAT *b, BAT *g, BAT *e, BAT *s, int tp,
int skip_nils, int abort_on_error);
gdk_export BAT *BATgroupstdev_population(BAT *b, BAT *g, BAT *e, BAT *s, int
tp, int skip_nils, int abort_on_error);
+gdk_export dbl BATcalcvariance_population(dbl *avgp, BAT *b);
+gdk_export dbl BATcalcvariance_sample(dbl *avgp, BAT *b);
+gdk_export BAT *BATgroupvariance_sample(BAT *b, BAT *g, BAT *e, BAT *s, int
tp, int skip_nils, int abort_on_error);
+gdk_export BAT *BATgroupvariance_population(BAT *b, BAT *g, BAT *e, BAT *s,
int tp, int skip_nils, int abort_on_error);
diff --git a/monetdb5/modules/kernel/aggr.c b/monetdb5/modules/kernel/aggr.c
--- a/monetdb5/modules/kernel/aggr.c
+++ b/monetdb5/modules/kernel/aggr.c
@@ -434,6 +434,36 @@ AGGRstdevp2_dbl(bat *retval, bat *bid, b
return AGGRgrouped2(retval, bid, eid, TYPE_dbl,
BATgroupstdev_population, 1, "aggr.stdevp");
}
+aggr_export str AGGRvariance3_dbl(bat *retval, bat *bid, bat *gid, bat *eid);
+str
+AGGRvariance3_dbl(bat *retval, bat *bid, bat *gid, bat *eid)
+{
+ return AGGRgrouped3(retval, bid, gid, eid, TYPE_dbl,
+ BATgroupvariance_sample, 1,
"aggr.variance");
+}
+
+aggr_export str AGGRvariance2_dbl(bat *retval, bat *bid, bat *eid);
+str
+AGGRvariance2_dbl(bat *retval, bat *bid, bat *eid)
+{
+ return AGGRgrouped2(retval, bid, eid, TYPE_dbl,
BATgroupvariance_sample, 1, "aggr.variance");
+}
+
+aggr_export str AGGRvariancep3_dbl(bat *retval, bat *bid, bat *gid, bat *eid);
+str
+AGGRvariancep3_dbl(bat *retval, bat *bid, bat *gid, bat *eid)
+{
+ return AGGRgrouped3(retval, bid, gid, eid, TYPE_dbl,
+ BATgroupvariance_population, 1,
"aggr.variancep");
+}
+
+aggr_export str AGGRvariancep2_dbl(bat *retval, bat *bid, bat *eid);
+str
+AGGRvariancep2_dbl(bat *retval, bat *bid, bat *eid)
+{
+ return AGGRgrouped2(retval, bid, eid, TYPE_dbl,
BATgroupvariance_population, 1, "aggr.variancep");
+}
+
aggr_export str AGGRcount3(bat *retval, bat *bid, bat *gid, bat *eid, bit
*ignorenils);
str
AGGRcount3(bat *retval, bat *bid, bat *gid, bat *eid, bit *ignorenils)
@@ -868,6 +898,38 @@ AGGRsubstdevpcand_dbl(bat *retval, bat *
*abort_on_error, TYPE_dbl,
BATgroupstdev_population, "aggr.substdevp");
}
+aggr_export str AGGRsubvariance_dbl(bat *retval, bat *bid, bat *gid, bat *eid,
int *skip_nils, int *abort_on_error);
+str
+AGGRsubvariance_dbl(bat *retval, bat *bid, bat *gid, bat *eid, int *skip_nils,
int *abort_on_error)
+{
+ return AGGRsubgrouped(retval, bid, gid, eid, NULL, *skip_nils,
+ *abort_on_error, TYPE_dbl,
BATgroupvariance_sample, "aggr.subvariance");
+}
+
+aggr_export str AGGRsubvariancecand_dbl(bat *retval, bat *bid, bat *gid, bat
*eid, bat *sid, int *skip_nils, int *abort_on_error);
+str
+AGGRsubvariancecand_dbl(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid,
int *skip_nils, int *abort_on_error)
+{
+ return AGGRsubgrouped(retval, bid, gid, eid, sid, *skip_nils,
+ *abort_on_error, TYPE_dbl,
BATgroupvariance_sample, "aggr.subvariance");
+}
+
+aggr_export str AGGRsubvariancep_dbl(bat *retval, bat *bid, bat *gid, bat
*eid, int *skip_nils, int *abort_on_error);
+str
+AGGRsubvariancep_dbl(bat *retval, bat *bid, bat *gid, bat *eid, int
*skip_nils, int *abort_on_error)
+{
+ return AGGRsubgrouped(retval, bid, gid, eid, NULL, *skip_nils,
+ *abort_on_error, TYPE_dbl,
BATgroupvariance_population, "aggr.subvariancep");
+}
+
+aggr_export str AGGRsubvariancepcand_dbl(bat *retval, bat *bid, bat *gid, bat
*eid, bat *sid, int *skip_nils, int *abort_on_error);
+str
+AGGRsubvariancepcand_dbl(bat *retval, bat *bid, bat *gid, bat *eid, bat *sid,
int *skip_nils, int *abort_on_error)
+{
+ return AGGRsubgrouped(retval, bid, gid, eid, sid, *skip_nils,
+ *abort_on_error, TYPE_dbl,
BATgroupvariance_population, "aggr.subvariancep");
+}
+
aggr_export str AGGRsubcount(bat *retval, bat *bid, bat *gid, bat *eid, int
*skip_nils);
str
AGGRsubcount(bat *retval, bat *bid, bat *gid, bat *eid, int *skip_nils)
diff --git a/monetdb5/modules/kernel/aggr.mal b/monetdb5/modules/kernel/aggr.mal
--- a/monetdb5/modules/kernel/aggr.mal
+++ b/monetdb5/modules/kernel/aggr.mal
@@ -724,6 +724,38 @@ command substdevp(b:bat[:oid,:bte],g:bat
address AGGRsubstdevpcand_dbl
comment "Grouped stdevp aggregate with candidates list";
+command variance(b:bat[:oid,:bte], e:bat[:oid,:any_1]) :bat[:oid,:dbl]
+address AGGRvariance2_dbl
+comment "Grouped tail average on bte";
+
+command variance(b:bat[:oid,:bte], g:bat[:oid,:oid],
e:bat[:oid,:any_1]):bat[:oid,:dbl]
+address AGGRvariance3_dbl
+comment "Grouped tail average on bte";
+
+command
subvariance(b:bat[:oid,:bte],g:bat[:oid,:oid],e:bat[:oid,:any_1],skip_nils:int,abort_on_error:int)
:bat[:oid,:dbl]
+address AGGRsubvariance_dbl
+comment "Grouped variance aggregate";
+
+command
subvariance(b:bat[:oid,:bte],g:bat[:oid,:oid],e:bat[:oid,:any_1],s:bat[:oid,:oid],skip_nils:int,abort_on_error:int)
:bat[:oid,:dbl]
+address AGGRsubvariancecand_dbl
+comment "Grouped variance aggregate with candidates list";
+
+command variancep(b:bat[:oid,:bte], e:bat[:oid,:any_1]) :bat[:oid,:dbl]
+address AGGRvariancep2_dbl
+comment "Grouped tail average on bte";
+
+command variancep(b:bat[:oid,:bte], g:bat[:oid,:oid],
e:bat[:oid,:any_1]):bat[:oid,:dbl]
+address AGGRvariancep3_dbl
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list