This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit decf1c2924011e40d68aefd89f84ad3fa2080196
Author: pranavyl <pranav.lo...@cloudera.com>
AuthorDate: Fri Dec 8 19:11:28 2023 +0100

    IMPALA-12744: Support for regr_count() aggregate/analytic function
    
    The regr_count() function can be used both as an aggregate and
    an analytic function and is commonly used in regression analysis.
    
    regr_count(y, x) returns an integer that is the number of non-null
    number pairs. It indicates how many observations are included in the
    analysis.
    
    Testing:
    The functions are extensively tested and cross-checked with Hive.
    The tests can be found in aggregation.test.
    
    Change-Id: I773d1e0edc8a9c8ee003f75721f4844685b2eb38
    Reviewed-on: http://gerrit.cloudera.org:8080/20936
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exprs/aggregate-functions-ir.cc             |  33 +++
 be/src/exprs/aggregate-functions.h                 |  10 +
 .../java/org/apache/impala/catalog/BuiltinsDb.java |  19 ++
 .../queries/QueryTest/aggregation.test             | 243 +++++++++++++++++++++
 4 files changed, 305 insertions(+)

diff --git a/be/src/exprs/aggregate-functions-ir.cc 
b/be/src/exprs/aggregate-functions-ir.cc
index bf41acb04..c9e4b0439 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -288,6 +288,39 @@ void AggregateFunctions::CountMerge(FunctionContext*, 
const BigIntVal& src,
   dst->val += src.val;
 }
 
+// regr_count(y, x) returns an integer that is the number of non-null
+// number pairs. It indicates how many observations are included in the
+// analysis.
+void AggregateFunctions::RegrCountUpdate(
+    FunctionContext*, const DoubleVal& src1, const DoubleVal& src2, BigIntVal* 
dst) {
+  DCHECK(!dst->is_null);
+  if (!src1.is_null && !src2.is_null) ++dst->val;
+}
+
+void AggregateFunctions::RegrCountRemove(
+    FunctionContext*, const DoubleVal& src1, const DoubleVal& src2, BigIntVal* 
dst) {
+  DCHECK(!dst->is_null);
+  if (!src1.is_null && !src2.is_null) {
+    --dst->val;
+    DCHECK_GE(dst->val, 0);
+  }
+}
+
+void AggregateFunctions::TimestampRegrCountUpdate(FunctionContext*,
+    const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst) {
+  DCHECK(!dst->is_null);
+  if (!src1.is_null && !src2.is_null) ++dst->val;
+}
+
+void AggregateFunctions::TimestampRegrCountRemove(FunctionContext*,
+    const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst) {
+  DCHECK(!dst->is_null);
+  if (!src1.is_null && !src2.is_null) {
+    --dst->val;
+    DCHECK_GE(dst->val, 0);
+  }
+}
+
 // Implementation of regr_slope() and regr_intercept():
 // RegrSlopeState is used for implementing regr_slope() and regr_intercept().
 // regr_slope() and regr_intercept() take two arguments of numeric type and 
return the
diff --git a/be/src/exprs/aggregate-functions.h 
b/be/src/exprs/aggregate-functions.h
index c261e80aa..13aadb8e9 100644
--- a/be/src/exprs/aggregate-functions.h
+++ b/be/src/exprs/aggregate-functions.h
@@ -66,6 +66,16 @@ class AggregateFunctions {
   static StringVal StringValSerializeOrFinalize(
       FunctionContext* ctx, const StringVal& src);
 
+  /// Implementation of Regr_Count()
+  static void RegrCountUpdate(FunctionContext*, const DoubleVal& src1,
+      const DoubleVal& src2, BigIntVal* dst);
+  static void RegrCountRemove(FunctionContext*, const DoubleVal& src1,
+      const DoubleVal& src2, BigIntVal* dst);
+  static void TimestampRegrCountUpdate(FunctionContext*,
+      const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst);
+  static void TimestampRegrCountRemove(FunctionContext*,
+      const TimestampVal& src1, const TimestampVal& src2, BigIntVal* dst);
+
   /// Implementation of regr_slope() and regr_intercept()
   static void RegrSlopeInit(FunctionContext* ctx, StringVal* dst);
   static void RegrSlopeUpdate(FunctionContext* ctx, const DoubleVal& src1,
diff --git a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java 
b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java
index 1f1532761..69b3b6109 100644
--- a/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java
+++ b/fe/src/main/java/org/apache/impala/catalog/BuiltinsDb.java
@@ -1367,6 +1367,25 @@ public class BuiltinsDb extends Db {
         prefix + 
"12CorrFinalizeEPN10impala_udf15FunctionContextERKNS1_9StringValE",
         false, true, false));
 
+    // Regr_count()
+    db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_count",
+        Lists.<Type>newArrayList(Type.DOUBLE, Type.DOUBLE), Type.BIGINT, 
Type.BIGINT,
+        prefix + 
"8InitZeroIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextEPT_",
+        prefix + 
"15RegrCountUpdateEPN10impala_udf15FunctionContextERKNS1_9DoubleValES6_PNS1_9BigIntValE",
+        prefix + 
"10CountMergeEPN10impala_udf15FunctionContextERKNS1_9BigIntValEPS4_",
+        null, null,
+        prefix + 
"15RegrCountRemoveEPN10impala_udf15FunctionContextERKNS1_9DoubleValES6_PNS1_9BigIntValE",
+        null, false, true, true));
+
+    db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_count",
+        Lists.<Type>newArrayList(Type.TIMESTAMP, Type.TIMESTAMP), Type.BIGINT, 
Type.BIGINT,
+        prefix + 
"8InitZeroIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextEPT_",
+        prefix + 
"24TimestampRegrCountUpdateEPN10impala_udf15FunctionContextERKNS1_12TimestampValES6_PNS1_9BigIntValE",
+        prefix + 
"10CountMergeEPN10impala_udf15FunctionContextERKNS1_9BigIntValEPS4_",
+        null, null,
+        prefix + 
"24TimestampRegrCountRemoveEPN10impala_udf15FunctionContextERKNS1_12TimestampValES6_PNS1_9BigIntValE",
+        null, false, true, true));
+
     // Regr_r2()
     db.addBuiltin(AggregateFunction.createBuiltin(db, "regr_r2",
         Lists.<Type>newArrayList(Type.DOUBLE, Type.DOUBLE), Type.DOUBLE, 
Type.STRING,
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test 
b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
index df8162d14..39c4d7d75 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/aggregation.test
@@ -2667,3 +2667,246 @@ select s_store_sk, regr_r2(s_floor_space, 
s_number_employees) over (partition by
 ---- TYPES
 int, double
 ====
+---- QUERY
+# regr_count() function examples
+select regr_count(ps_availqty, ps_supplycost) from tpch.partsupp;
+---- RESULTS
+800000
+---- TYPES
+bigint
+====
+---- QUERY
+# Behavior of regr_count() on null table
+select regr_count(d, e) from functional.nulltable;
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+# Behavior of regr_count() on empty table
+select regr_count(f2, f2) from functional.emptytable;
+---- RESULTS
+0
+---- TYPES
+bigint
+====
+---- QUERY
+# regr_count() on different datatypes
+select regr_count(tinyint_col, tinyint_col), regr_count(smallint_col, 
smallint_col),
+ regr_count(int_col, int_col), regr_count(bigint_col, bigint_col), 
regr_count(float_col, float_col),
+ regr_count(double_col, double_col), regr_count(timestamp_col, timestamp_col) 
from functional.alltypes;
+---- RESULTS
+7300,7300,7300,7300,7300,7300,7300
+---- TYPES
+bigint,bigint,bigint,bigint,bigint,bigint,bigint
+====
+---- QUERY
+# regr_count() on timestamp columns
+select regr_count(utctime, localtime) from functional.alltimezones;
+---- RESULTS
+857
+---- TYPES
+bigint
+====
+---- QUERY
+# Since group by id will result in a single row, this test shows that 
regr_count() returns 1 in case of a single row.
+select id, regr_count(int_col, int_col) from functional.alltypestiny group by 
id;
+---- RESULTS
+2,1
+4,1
+0,1
+6,1
+1,1
+7,1
+3,1
+5,1
+---- TYPES
+int,bigint
+====
+---- QUERY
+# regr_count() on decimal datatype
+select regr_count(d3, d4) from functional.decimal_tbl;
+---- RESULTS
+5
+---- TYPES
+bigint
+====
+---- QUERY
+select year, regr_count(double_col, double_col) from functional.alltypes group 
by year;
+---- RESULTS
+2009,3650
+2010,3650
+---- TYPES
+int,bigint
+====
+---- QUERY
+select regr_count(double_col, -double_col) from functional.alltypes;
+---- RESULTS
+7300
+---- TYPES
+bigint
+====
+---- QUERY
+select regr_count(double_col, double_col) from functional.alltypes;
+---- RESULTS
+7300
+---- TYPES
+bigint
+====
+---- QUERY
+select regr_count(ss_sold_time_sk, ss_quantity) from tpcds.store_sales;
+---- RESULTS
+2685616
+---- TYPES
+bigint
+====
+---- QUERY
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk)
+ from tpcds.store;
+---- RESULTS
+5,1
+8,2
+12,3
+1,1
+2,2
+3,3
+4,4
+6,5
+7,6
+9,7
+10,8
+11,9
+---- TYPES
+int,bigint
+====
+---- QUERY
+select id, double_col, regr_count(double_col, int_col) over (partition by 
month order by id) from functional.alltypes
+ order by id limit 10;
+---- RESULTS
+0,0.0,1
+1,10.1,2
+2,20.2,3
+3,30.3,4
+4,40.4,5
+5,50.5,6
+6,60.6,7
+7,70.7,8
+8,80.8,9
+9,90.9,10
+---- TYPES
+int,double,bigint
+====
+---- QUERY
+# regr_count() when one column is filled with null
+select regr_count(null_int, rand()), regr_count(rand(), null_int) from 
functional.nullrows;
+---- RESULTS
+0,0
+---- TYPES
+bigint,bigint
+====
+---- QUERY
+# regr_count() supporting join
+select regr_count(A.double_col, B.double_col) from functional.alltypes A, 
functional.alltypes B where A.id=B.id;
+---- RESULTS
+7300
+---- TYPES
+bigint
+====
+---- QUERY
+# Tests functioning of regrCountRemove()
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk
+ rows between 5 preceding and 2 following) from tpcds.store;
+---- RESULTS
+5,3
+8,3
+12,3
+1,3
+2,4
+3,5
+4,6
+6,7
+7,8
+9,8
+10,7
+11,6
+---- TYPES
+int,bigint
+====
+---- QUERY
+# regr_count() when window size is 2
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk
+ rows between 1 preceding and current row) from tpcds.store;
+---- RESULTS
+5,1
+8,2
+12,2
+1,1
+2,2
+3,2
+4,2
+6,2
+7,2
+9,2
+10,2
+11,2
+---- TYPES
+int,bigint
+====
+---- QUERY
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk
+ rows between current row and 1 following) from tpcds.store;
+---- RESULTS
+5,2
+8,2
+12,1
+1,2
+2,2
+3,2
+4,2
+6,2
+7,2
+9,2
+10,2
+11,1
+---- TYPES
+int,bigint
+====
+---- QUERY
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk
+ rows between current row and unbounded following) from tpcds.store;
+---- RESULTS
+12,1
+8,2
+5,3
+11,1
+10,2
+9,3
+7,4
+6,5
+4,6
+3,7
+2,8
+1,9
+---- TYPES
+int,bigint
+====
+---- QUERY
+select s_store_sk, regr_count(s_number_employees, s_floor_space) over 
(partition by s_city order by s_store_sk
+ rows between unbounded preceding and current row) from tpcds.store;
+---- RESULTS
+5,1
+8,2
+12,3
+1,1
+2,2
+3,3
+4,4
+6,5
+7,6
+9,7
+10,8
+11,9
+---- TYPES
+int,bigint
+====

Reply via email to