Changeset: dfdcf4d6bae1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=dfdcf4d6bae1
Modified Files:
gdk/gdk.mx
gdk/gdk_calc.c
monetdb5/modules/kernel/batcalc.c
monetdb5/modules/kernel/batcalc.mal
monetdb5/modules/kernel/batcalc.mal.sh
Branch: gdk-calc
Log Message:
Implemented an overfow-less and precise version of avg.
The old aggr.avg implementation calculates the sum and divides by the
count. This can cause overflow, and it is not precise. Take e.g. the
average of a bat with just two lng values:
9223372036854775807
-9223372036854775806
aggr.avg will give the result 0 whereas the correct result is 0.5.
In order to facilitate aggregation of multiple averages, there is also
a version which returns the average and the count of non-nil values.
diffs (247 lines):
diff --git a/gdk/gdk.mx b/gdk/gdk.mx
--- a/gdk/gdk.mx
+++ b/gdk/gdk.mx
@@ -3355,6 +3355,7 @@ gdk_export int VARcalcbetween(ValPtr ret
gdk_export BAT *BATconvert(BAT *b, int tp, int abort_on_error);
gdk_export int VARconvert(ValPtr ret, const ValRecord *v, int abort_on_error);
+gdk_export int BATcalcavg(BAT *b, dbl *avg, BUN *vals);
/* generic n-ary multijoin beast, with defines to interpret retval */
#define MULTIJOIN_SORTED(r) ((char*) &r)[0]
diff --git a/gdk/gdk_calc.c b/gdk/gdk_calc.c
--- a/gdk/gdk_calc.c
+++ b/gdk/gdk_calc.c
@@ -12555,3 +12555,120 @@ VARconvert(ValPtr ret, const ValRecord *
ATOMname(v->vtype), ATOMname(ret->vtype));
return GDK_FAIL;
}
+
+/* signed version of BUN */
+#if SIZEOF_BUN == SIZEOF_INT
+#define SBUN int
+#else
+#define SBUN lng
+#endif
+
+#define AVERAGE_TYPE(TYPE) \
+ do { \
+ TYPE a = 0, x, an, xn, z1; \
+ for (i = 0; i < cnt; i++) { \
+ x = ((TYPE *) src)[i]; \
+ if (x == TYPE##_nil) \
+ continue; \
+ n++; \
+ /* calculate z1 = (x - a) / n, rounded down \
+ * (towards \ negative infinity), and \
+ * calculate z2 = remainder of \ the division \
+ * (i.e. 0 <= z2 < n); do this without \
+ * causing overflow */ \
+ an = (TYPE) (a / (SBUN) n); \
+ xn = (TYPE) (x / (SBUN) n); \
+ /* z1 will be (x - a) / n rounded towards -INF */ \
+ z1 = xn - an; \
+ xn = x - (TYPE) (xn * (SBUN) n); \
+ an = a - (TYPE) (an * (SBUN) n); \
+ /* z2 will be remainder of above division */ \
+ if (xn >= an) { \
+ z2 = (BUN) (xn - an); \
+ /* loop invariant: (x - a) - z1 * n == z2 */ \
+ while (z2 >= n) { \
+ z2 -= n; \
+ z1++; \
+ } \
+ } else { \
+ z2 = (BUN) (an - xn); \
+ /* loop invariant (until we break): (x - a) -
z1 * n == -z2 */ \
+ for (;;) { \
+ z1--; \
+ if (z2 < n) { \
+ z2 = n - z2; /* proper
remainder */ \
+ break; \
+ } \
+ z2 -= n; \
+ } \
+ } \
+ a += z1; \
+ r += z2; \
+ if (r >= n) { \
+ r -= n; \
+ a++; \
+ } \
+ } \
+ *avg = n > 0 ? a + (dbl) r / n : dbl_nil; \
+ } while (0)
+
+#define AVERAGE_FLOATTYPE(TYPE)
\
+ do { \
+ double a = 0; \
+ TYPE x; \
+ for (i = 0; i < cnt; i++) { \
+ x = ((TYPE *) src)[i]; \
+ if (x == TYPE##_nil) \
+ continue; \
+ n++; \
+ if ((a > 0) == (x > 0)) { \
+ a += (x - a) / (SBUN) n; \
+ } else { \
+ /* no overflow at the cost of an extra \
+ * division and slight loss of \
+ * precision */ \
+ a = a - a / (SBUN) n + x / (SBUN) n; \
+ } \
+ } \
+ *avg = n > 0 ? a : dbl_nil; \
+ } while (0)
+
+int
+BATcalcavg(BAT *b, dbl *avg, BUN *vals)
+{
+ BUN n = 0, r = 0, i = 0, z2, cnt;
+ void *src;
+
+ src = Tloc(b, b->U->first);
+ cnt = BATcount(b);
+
+ BATaccessBegin(b, USE_TAIL, MMAP_SEQUENTIAL);
+ switch (ATOMstorage(b->T->type)) {
+ case TYPE_bte:
+ AVERAGE_TYPE(bte);
+ break;
+ case TYPE_sht:
+ AVERAGE_TYPE(sht);
+ break;
+ case TYPE_int:
+ AVERAGE_TYPE(int);
+ break;
+ case TYPE_lng:
+ AVERAGE_TYPE(lng);
+ break;
+ case TYPE_flt:
+ AVERAGE_FLOATTYPE(flt);
+ break;
+ case TYPE_dbl:
+ AVERAGE_FLOATTYPE(dbl);
+ break;
+ default:
+ GDKerror("BATcalcavg: average of type %s unsupported.\n",
+ ATOMname(b->T->type));
+ return GDK_FAIL;
+ }
+ BATaccessEnd(b, USE_TAIL, MMAP_SEQUENTIAL);
+ if (vals)
+ *vals = n;
+ return GDK_SUCCEED;
+}
diff --git a/monetdb5/modules/kernel/batcalc.c
b/monetdb5/modules/kernel/batcalc.c
--- a/monetdb5/modules/kernel/batcalc.c
+++ b/monetdb5/modules/kernel/batcalc.c
@@ -1298,3 +1298,31 @@ CMDbatBETWEEN(Client cntxt, MalBlkPtr mb
BBPkeepref(*bid = bn->batCacheid);
return MAL_SUCCEED;
}
+
+batcalc_export str CMDcalcavg2(dbl *avg, lng *vals, bat *bid);
+
+str
+CMDcalcavg2(dbl *avg, lng *vals, bat *bid)
+{
+ BAT *b;
+ int ret;
+ BUN v;
+
+ if ((b = BATdescriptor(*bid)) == NULL)
+ throw(MAL, "aggr.avg", RUNTIME_OBJECT_MISSING);
+ ret = BATcalcavg(b, avg, &v);
+ BBPreleaseref(b->batCacheid);
+ if (ret == GDK_FAIL)
+ return mythrow(MAL, "aggr.avg", OPERATION_FAILED);
+ if (vals)
+ *vals = (lng) v;
+ return MAL_SUCCEED;
+}
+
+batcalc_export str CMDcalcavg(dbl *avg, bat *bid);
+
+str
+CMDcalcavg(dbl *avg, bat *bid)
+{
+ return CMDcalcavg2(avg, NULL, bid);
+}
diff --git a/monetdb5/modules/kernel/batcalc.mal
b/monetdb5/modules/kernel/batcalc.mal
--- a/monetdb5/modules/kernel/batcalc.mal
+++ b/monetdb5/modules/kernel/batcalc.mal
@@ -11894,3 +11894,53 @@ pattern between(b:bat[:oid,:oid],lo:oid,
address CMDbatBETWEENcst
comment "B between LO and HI inclusive, nil border is (minus) infinity";
+
+command avg(b:bat[:oid,:bte]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:bte]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:sht]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:sht]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:int]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:int]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:wrd]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:wrd]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:lng]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:lng]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:flt]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:flt]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+command avg(b:bat[:oid,:dbl]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:dbl]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
diff --git a/monetdb5/modules/kernel/batcalc.mal.sh
b/monetdb5/modules/kernel/batcalc.mal.sh
--- a/monetdb5/modules/kernel/batcalc.mal.sh
+++ b/monetdb5/modules/kernel/batcalc.mal.sh
@@ -463,3 +463,16 @@ comment "B between LO and HI inclusive,
EOF
done
+echo
+
+for tp in $numeric; do
+ cat <<EOF
+command avg(b:bat[:oid,:$tp]) :dbl
+address CMDcalcavg
+comment "average of non-nil values of B";
+command avg2(b:bat[:oid,:$tp]) (:dbl, :lng)
+address CMDcalcavg2
+comment "average and number of non-nil values of B";
+
+EOF
+done
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list