This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e1e110f53fc [SPARK-37672][SQL] Support ANSI Aggregate Function:
regr_sxx
e1e110f53fc is described below
commit e1e110f53fc980eb30b2684544eeb97b7acd3f45
Author: Jiaan Geng <[email protected]>
AuthorDate: Tue Apr 19 15:35:24 2022 +0800
[SPARK-37672][SQL] Support ANSI Aggregate Function: regr_sxx
### What changes were proposed in this pull request?
This PR adds support for the ANSI aggregate function `regr_sxx`.
Mainstream databases support `regr_sxx`, as shown below:
**Teradata**
https://docs.teradata.com/r/kmuOwjp1zEYg98JsB8fu_A/PBEW1OPIaxqkIf3CJfIr6A
**Snowflake**
https://docs.snowflake.com/en/sql-reference/functions/regr_sxx.html
**Oracle**
https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/REGR_-Linear-Regression-Functions.html#GUID-A675B68F-2A88-4843-BE2C-FCDE9C65F9A9
**DB2**
https://www.ibm.com/docs/en/db2/11.5?topic=af-regression-functions-regr-avgx-regr-avgy-regr-count
**H2**
http://www.h2database.com/html/functions-aggregate.html#regr_sxx
**Postgresql**
https://www.postgresql.org/docs/8.4/functions-aggregate.html
**Sybase**
https://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.help.sqlanywhere.12.0.0/dbreference/regr-sxx-function.html
**Exasol**
https://docs.exasol.com/sql_references/functions/alphabeticallistfunctions/regr_function.htm
### Why are the changes needed?
`regr_sxx` is very useful.
### Does this PR introduce _any_ user-facing change?
'Yes'. New feature.
### How was this patch tested?
New tests.
Closes #34943 from beliefer/SPARK-37672.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/analysis/FunctionRegistry.scala | 1 +
.../expressions/aggregate/CentralMomentAgg.scala | 13 ++++++++
.../expressions/aggregate/linearRegression.scala | 33 +++++++++++++++++++-
.../sql-functions/sql-expression-schema.md | 3 +-
.../test/resources/sql-tests/inputs/group-by.sql | 6 ++++
.../inputs/postgreSQL/aggregates_part1.sql | 2 +-
.../inputs/udf/postgreSQL/udf-aggregates_part1.sql | 2 +-
.../resources/sql-tests/results/group-by.sql.out | 35 +++++++++++++++++++++-
.../results/postgreSQL/aggregates_part1.sql.out | 10 ++++++-
.../udf/postgreSQL/udf-aggregates_part1.sql.out | 10 ++++++-
10 files changed, 108 insertions(+), 7 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 80374f769a2..47fdca8ebe4 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -501,6 +501,7 @@ object FunctionRegistry {
expression[RegrAvgX]("regr_avgx"),
expression[RegrAvgY]("regr_avgy"),
expression[RegrR2]("regr_r2"),
+ expression[RegrSXX]("regr_sxx"),
// string functions
expression[Ascii]("ascii"),
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
index c5c78e5062f..a40c5e4815f 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
@@ -264,6 +264,19 @@ case class VarianceSamp(
copy(child = newChild)
}
+case class RegrSXXReplacement(child: Expression)
+ extends CentralMomentAgg(child, !SQLConf.get.legacyStatisticalAggregate) {
+
+ override protected def momentOrder = 2
+
+ override val evaluateExpression: Expression = {
+ If(n === 0.0, Literal.create(null, DoubleType), m2)
+ }
+
+ override protected def withNewChildInternal(newChild: Expression):
RegrSXXReplacement =
+ copy(child = newChild)
+}
+
@ExpressionDescription(
usage = "_FUNC_(expr) - Returns the skewness value calculated from values of
a group.",
examples = """
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
index 7463ef59c78..4c1749fa00e 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
@@ -18,7 +18,7 @@
package org.apache.spark.sql.catalyst.expressions.aggregate
import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.expressions.{And, Expression,
ExpressionDescription, If, ImplicitCastInputTypes, IsNotNull, Literal,
RuntimeReplaceableAggregate}
+import org.apache.spark.sql.catalyst.expressions.{And, Expression,
ExpressionDescription, If, ImplicitCastInputTypes, IsNotNull, IsNull, Literal,
Or, RuntimeReplaceableAggregate}
import org.apache.spark.sql.catalyst.trees.BinaryLike
import org.apache.spark.sql.types.{AbstractDataType, DoubleType, NumericType}
@@ -150,3 +150,34 @@ case class RegrR2(x: Expression, y: Expression) extends
PearsonCorrelation(x, y,
newLeft: Expression, newRight: Expression): RegrR2 =
this.copy(x = newLeft, y = newRight)
}
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(y, x) - Returns REGR_COUNT(y, x) * VAR_POP(x) for non-null
pairs in a group, where `y` is the dependent variable and `x` is the
independent variable.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS
tab(y, x);
+ 2.75
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS
tab(y, x);
+ 2.0
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4)
AS tab(y, x);
+ 2.0
+ """,
+ group = "agg_funcs",
+ since = "3.4.0")
+// scalastyle:on line.size.limit
+case class RegrSXX(
+ left: Expression,
+ right: Expression)
+ extends AggregateFunction
+ with RuntimeReplaceableAggregate
+ with ImplicitCastInputTypes
+ with BinaryLike[Expression] {
+ override lazy val replacement: Expression =
+ RegrSXXReplacement(If(Or(IsNull(left), IsNull(right)),
Literal.create(null, DoubleType), right))
+ override def nodeName: String = "regr_sxx"
+ override def inputTypes: Seq[DoubleType] = Seq(DoubleType, DoubleType)
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): RegrSXX =
+ this.copy(left = newLeft, right = newRight)
+}
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 9f8faf517a4..a67c402f4fe 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -1,6 +1,6 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- - Number of queries: 388
+ - Number of queries: 389
- Number of expressions that missing example: 12
- Expressions missing examples:
bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint
## Schema of Built-in Functions
@@ -374,6 +374,7 @@
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrAvgY | regr_avgy |
SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)
| struct<regr_avgy(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrCount | regr_count |
SELECT regr_count(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)
| struct<regr_count(y, x):bigint> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrR2 | regr_r2 |
SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) |
struct<regr_r2(y, x):double> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSXX | regr_sxx |
SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) |
struct<regr_sxx(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.Skewness | skewness |
SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) |
struct<skewness(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop | stddev_pop |
SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col) |
struct<stddev_pop(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | std |
SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<std(col):double>
|
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index b97b24140ed..0fc9366a92a 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -270,6 +270,12 @@ SELECT regr_avgx(y, x), regr_avgy(y, x) FROM
testRegression WHERE x IS NOT NULL
SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
GROUP BY k;
SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER
(WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM
testRegression GROUP BY k;
+-- SPARK-37672: Support ANSI Aggregate Function: regr_sxx
+SELECT regr_sxx(y, x) FROM testRegression;
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL;
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL GROUP BY k;
+
-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
diff --git
a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
index 848ca433346..2ae37f148ff 100644
---
a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
+++
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
@@ -81,7 +81,7 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
-- SQL2003 binary aggregates [SPARK-23907]
SELECT regr_count(b, a) FROM aggtest;
--- SELECT regr_sxx(b, a) FROM aggtest;
+SELECT regr_sxx(b, a) FROM aggtest;
-- SELECT regr_syy(b, a) FROM aggtest;
-- SELECT regr_sxy(b, a) FROM aggtest;
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
diff --git
a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
index 7d0030ba0c5..b20b85193b9 100644
---
a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
+++
b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
@@ -81,7 +81,7 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
-- SQL2003 binary aggregates [SPARK-23907]
SELECT regr_count(b, a) FROM aggtest;
--- SELECT regr_sxx(b, a) FROM aggtest;
+SELECT regr_sxx(b, a) FROM aggtest;
-- SELECT regr_syy(b, a) FROM aggtest;
-- SELECT regr_sxy(b, a) FROM aggtest;
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index e9f650edfcf..75bf4f1c2f7 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 99
+-- Number of queries: 103
-- !query
@@ -972,6 +972,39 @@ struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y
IS NOT NULL))):double,
2 22.666666666666668 20.0 22.666666666666668 20.0
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+1 NULL
+2 288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+2 288.66666666666663
+
+
-- !query
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
diff --git
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
index 5f4ea7aef70..529438940b4 100644
---
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 47
+-- Number of queries: 48
-- !query
@@ -296,6 +296,14 @@ struct<regr_count(b, a):bigint>
4
+-- !query
+SELECT regr_sxx(b, a) FROM aggtest
+-- !query schema
+struct<regr_sxx(b, a):double>
+-- !query output
+5099.0
+
+
-- !query
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest
-- !query schema
diff --git
a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
index f82184933f7..fba60af8219 100644
---
a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 46
+-- Number of queries: 47
-- !query
@@ -287,6 +287,14 @@ struct<regr_count(b, a):bigint>
4
+-- !query
+SELECT regr_sxx(b, a) FROM aggtest
+-- !query schema
+struct<regr_sxx(b, a):double>
+-- !query output
+5099.0
+
+
-- !query
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest
-- !query schema
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]