This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit decf1c2924011e40d68aefd89f84ad3fa2080196 Author: pranavyl <pranav.lo...@cloudera.com> AuthorDate: Fri Dec 8 19:11:28 2023 +0100 IMPALA-12744: Support for regr_count() aggregate/analytic function The regr_count() function can be used both as an aggregate and an analytic function and is commonly used in regression analysis. regr_count(y, x) returns an integer that is the number of non-null number pairs. It indicates how many observations are included in the analysis. Testing: The functions are extensively tested and cross-checked with Hive. The tests can be found in aggregation.test. Change-Id: I773d1e0edc8a9c8ee003f75721f4844685b2eb38 Reviewed-on: http://gerrit.cloudera.org:8080/20936 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/exprs/aggregate-functions-ir.cc | 33 +++ be/src/exprs/aggregate-functions.h | 10 + .../java/org/apache/impala/catalog/BuiltinsDb.java | 19 ++ .../queries/QueryTest/aggregation.test | 243 +++++++++++++++++++++ 4 files changed, 305 insertions(+) diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc index bf41acb04..c9e4b0439 100644 --- a/be/src/exprs/aggregate-functions-ir.cc +++ b/be/src/exprs/aggregate-functions-ir.cc @@ -288,6 +288,39 @@ void AggregateFunctions::CountMerge(FunctionContext*, const BigIntVal& src, dst->val += src.val; } +// regr_count(y, x) returns an integer that is the number of non-null +// number pairs. It indicates how many observations are included in the +// analysis. +void AggregateFunctions::RegrCountUpdate( + FunctionContext*, const DoubleVal& src1, const DoubleVal& src2, BigIntVal* dst) { + DCHECK(!dst->is_null); + if (!src1.is_null && !src2.is_null) ++dst->val; +} + +void AggregateFunctions::RegrCountRemove( + FunctionContext*, const DoubleVal& src1, const DoubleVal& src2, BigIntVal* dst) { + DCHECK(!dst->is_null); + if (!src1.is_null && !src2.is_null) { + --dst->val; + DCHECK_GE(dst->val, 0); + } +} + +void AggregateFunctions::TimestampRegrCountUpdate(FunctionContext*, + const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst) { + DCHECK(!dst->is_null); + if (!src1.is_null && !src2.is_null) ++dst->val; +} + +void AggregateFunctions::TimestampRegrCountRemove(FunctionContext*, + const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst) { + DCHECK(!dst->is_null); + if (!src1.is_null && !src2.is_null) { + --dst->val; + DCHECK_GE(dst->val, 0); + } +} + // Implementation of regr_slope() and regr_intercept(): // RegrSlopeState is used for implementing regr_slope() and regr_intercept(). // regr_slope() and regr_intercept() take two arguments of numeric type and return the diff --git a/be/src/exprs/aggregate-functions.h b/be/src/exprs/aggregate-functions.h index c261e80aa..13aadb8e9 100644 --- a/be/src/exprs/aggregate-functions.h +++ b/be/src/exprs/aggregate-functions.h @@ -66,6 +66,16 @@ class AggregateFunctions { static StringVal StringValSerializeOrFinalize( FunctionContext* ctx, const StringVal& src); + /// Implementation of Regr_Count() + static void RegrCountUpdate(FunctionContext*, const DoubleVal& src1, + const DoubleVal& src2, BigIntVal* dst); + static void RegrCountRemove(FunctionContext*, const DoubleVal& src1, + const DoubleVal& src2, BigIntVal* dst); + static void TimestampRegrCountUpdate(FunctionContext*, + const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst); + static void TimestampRegrCountRemove(FunctionContext*, + const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst); + /// Implementation of regr_slope() and regr_intercept() static void RegrSlopeInit(FunctionContext* ctx, StringVal* dst); static void RegrSlopeUpdate(FunctionContext* ctx, const DoubleVal& src1, diff --git a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java index 1f1532761..69b3b6109 100644 --- a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java +++ b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java @@ -1367,6 +1367,25 @@ public class BuiltinsDb extends Db { prefix + "12CorrFinalizeEPN10impala_udf15FunctionContextERKNS1_9StringValE", false, true, false)); + // Regr_count() + db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_count", + Lists.<Type>newArrayList(Type.DOUBLE, Type.DOUBLE), Type.BIGINT, Type.BIGINT, + prefix + "8InitZeroIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextEPT_", + prefix + "15RegrCountUpdateEPN10impala_udf15FunctionContextERKNS1_9DoubleValES6_PNS1_9BigIntValE", + prefix + "10CountMergeEPN10impala_udf15FunctionContextERKNS1_9BigIntValEPS4_", + null, null, + prefix + "15RegrCountRemoveEPN10impala_udf15FunctionContextERKNS1_9DoubleValES6_PNS1_9BigIntValE", + null, false, true, true)); + + db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_count", + Lists.<Type>newArrayList(Type.TIMESTAMP, Type.TIMESTAMP), Type.BIGINT, Type.BIGINT, + prefix + "8InitZeroIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextEPT_", + prefix + "24TimestampRegrCountUpdateEPN10impala_udf15FunctionContextERKNS1_12TimestampValES6_PNS1_9BigIntValE", + prefix + "10CountMergeEPN10impala_udf15FunctionContextERKNS1_9BigIntValEPS4_", + null, null, + prefix + "24TimestampRegrCountRemoveEPN10impala_udf15FunctionContextERKNS1_12TimestampValES6_PNS1_9BigIntValE", + null, false, true, true)); + // Regr_r2() db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_r2", Lists.<Type>newArrayList(Type.DOUBLE, Type.DOUBLE), Type.DOUBLE, Type.STRING, diff --git a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test index df8162d14..39c4d7d75 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test +++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test @@ -2667,3 +2667,246 @@ select s_store_sk, regr_r2(s_floor_space, s_number_employees) over (partition by ---- TYPES int, double ==== +---- QUERY +# regr_count() function examples +select regr_count(ps_availqty, ps_supplycost) from tpch.partsupp; +---- RESULTS +800000 +---- TYPES +bigint +==== +---- QUERY +# Behavior of regr_count() on null table +select regr_count(d, e) from functional.nulltable; +---- RESULTS +0 +---- TYPES +bigint +==== +---- QUERY +# Behavior of regr_count() on empty table +select regr_count(f2, f2) from functional.emptytable; +---- RESULTS +0 +---- TYPES +bigint +==== +---- QUERY +# regr_count() on different datatypes +select regr_count(tinyint_col, tinyint_col), regr_count(smallint_col, smallint_col), + regr_count(int_col, int_col), regr_count(bigint_col, bigint_col), regr_count(float_col, float_col), + regr_count(double_col, double_col), regr_count(timestamp_col, timestamp_col) from functional.alltypes; +---- RESULTS +7300,7300,7300,7300,7300,7300,7300 +---- TYPES +bigint,bigint,bigint,bigint,bigint,bigint,bigint +==== +---- QUERY +# regr_count() on timestamp columns +select regr_count(utctime, localtime) from functional.alltimezones; +---- RESULTS +857 +---- TYPES +bigint +==== +---- QUERY +# Since group by id will result in a single row, this test shows that regr_count() returns 1 in case of a single row. +select id, regr_count(int_col, int_col) from functional.alltypestiny group by id; +---- RESULTS +2,1 +4,1 +0,1 +6,1 +1,1 +7,1 +3,1 +5,1 +---- TYPES +int,bigint +==== +---- QUERY +# regr_count() on decimal datatype +select regr_count(d3, d4) from functional.decimal_tbl; +---- RESULTS +5 +---- TYPES +bigint +==== +---- QUERY +select year, regr_count(double_col, double_col) from functional.alltypes group by year; +---- RESULTS +2009,3650 +2010,3650 +---- TYPES +int,bigint +==== +---- QUERY +select regr_count(double_col, -double_col) from functional.alltypes; +---- RESULTS +7300 +---- TYPES +bigint +==== +---- QUERY +select regr_count(double_col, double_col) from functional.alltypes; +---- RESULTS +7300 +---- TYPES +bigint +==== +---- QUERY +select regr_count(ss_sold_time_sk, ss_quantity) from tpcds.store_sales; +---- RESULTS +2685616 +---- TYPES +bigint +==== +---- QUERY +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk) + from tpcds.store; +---- RESULTS +5,1 +8,2 +12,3 +1,1 +2,2 +3,3 +4,4 +6,5 +7,6 +9,7 +10,8 +11,9 +---- TYPES +int,bigint +==== +---- QUERY +select id, double_col, regr_count(double_col, int_col) over (partition by month order by id) from functional.alltypes + order by id limit 10; +---- RESULTS +0,0.0,1 +1,10.1,2 +2,20.2,3 +3,30.3,4 +4,40.4,5 +5,50.5,6 +6,60.6,7 +7,70.7,8 +8,80.8,9 +9,90.9,10 +---- TYPES +int,double,bigint +==== +---- QUERY +# regr_count() when one column is filled with null +select regr_count(null_int, rand()), regr_count(rand(), null_int) from functional.nullrows; +---- RESULTS +0,0 +---- TYPES +bigint,bigint +==== +---- QUERY +# regr_count() supporting join +select regr_count(A.double_col, B.double_col) from functional.alltypes A, functional.alltypes B where A.id=B.id; +---- RESULTS +7300 +---- TYPES +bigint +==== +---- QUERY +# Tests functioning of regrCountRemove() +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk + rows between 5 preceding and 2 following) from tpcds.store; +---- RESULTS +5,3 +8,3 +12,3 +1,3 +2,4 +3,5 +4,6 +6,7 +7,8 +9,8 +10,7 +11,6 +---- TYPES +int,bigint +==== +---- QUERY +# regr_count() when window size is 2 +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk + rows between 1 preceding and current row) from tpcds.store; +---- RESULTS +5,1 +8,2 +12,2 +1,1 +2,2 +3,2 +4,2 +6,2 +7,2 +9,2 +10,2 +11,2 +---- TYPES +int,bigint +==== +---- QUERY +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk + rows between current row and 1 following) from tpcds.store; +---- RESULTS +5,2 +8,2 +12,1 +1,2 +2,2 +3,2 +4,2 +6,2 +7,2 +9,2 +10,2 +11,1 +---- TYPES +int,bigint +==== +---- QUERY +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk + rows between current row and unbounded following) from tpcds.store; +---- RESULTS +12,1 +8,2 +5,3 +11,1 +10,2 +9,3 +7,4 +6,5 +4,6 +3,7 +2,8 +1,9 +---- TYPES +int,bigint +==== +---- QUERY +select s_store_sk, regr_count(s_number_employees, s_floor_space) over (partition by s_city order by s_store_sk + rows between unbounded preceding and current row) from tpcds.store; +---- RESULTS +5,1 +8,2 +12,3 +1,1 +2,2 +3,3 +4,4 +6,5 +7,6 +9,7 +10,8 +11,9 +---- TYPES +int,bigint +====