This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8acce888596 [SPARK-37681][SQL] Support ANSI Aggregate Function:
regr_sxy
8acce888596 is described below
commit 8acce8885964fcb5b23046bdac8a882d723f35cc
Author: Jiaan Geng <[email protected]>
AuthorDate: Wed Apr 20 21:58:10 2022 +0800
[SPARK-37681][SQL] Support ANSI Aggregate Function: regr_sxy
### What changes were proposed in this pull request?
This PR used to support ANSI aggregate Function: `regr_sxy`
The mainstream database supports `regr_sxy` show below:
**Teradata**
https://docs.teradata.com/r/kmuOwjp1zEYg98JsB8fu_A/MXr3jFyWutZ9J4fhlXv_fQ
**Snowflake**
https://docs.snowflake.com/en/sql-reference/functions/regr_sxy.html
**Oracle**
https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/REGR_-Linear-Regression-Functions.html#GUID-A675B68F-2A88-4843-BE2C-FCDE9C65F9A9
**DB2**
https://www.ibm.com/docs/en/db2/11.5?topic=af-regression-functions-regr-avgx-regr-avgy-regr-count
**H2**
http://www.h2database.com/html/functions-aggregate.html#regr_sxy
**Postgresql**
https://www.postgresql.org/docs/8.4/functions-aggregate.html
**Sybase**
https://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.help.sqlanywhere.12.0.0/dbreference/regr-sxy-function.html
**Exasol**
https://docs.exasol.com/sql_references/functions/alphabeticallistfunctions/regr_function.htm
### Why are the changes needed?
`regr_sxy` is very useful.
### Does this PR introduce _any_ user-facing change?
'Yes'. New feature.
### How was this patch tested?
New tests.
Closes #34964 from beliefer/SPARK-37681.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/analysis/FunctionRegistry.scala | 1 +
.../expressions/aggregate/linearRegression.scala | 29 ++++++++++++++++--
.../sql-functions/sql-expression-schema.md | 3 +-
.../test/resources/sql-tests/inputs/group-by.sql | 6 ++++
.../inputs/postgreSQL/aggregates_part1.sql | 2 +-
.../inputs/udf/postgreSQL/udf-aggregates_part1.sql | 2 +-
.../resources/sql-tests/results/group-by.sql.out | 35 +++++++++++++++++++++-
.../results/postgreSQL/aggregates_part1.sql.out | 10 ++++++-
.../udf/postgreSQL/udf-aggregates_part1.sql.out | 10 ++++++-
9 files changed, 90 insertions(+), 8 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 47fdca8ebe4..f7af5b35a3b 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -502,6 +502,7 @@ object FunctionRegistry {
expression[RegrAvgY]("regr_avgy"),
expression[RegrR2]("regr_r2"),
expression[RegrSXX]("regr_sxx"),
+ expression[RegrSXY]("regr_sxy"),
// string functions
expression[Ascii]("ascii"),
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
index 098fc17b98a..568c186f06d 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
@@ -139,7 +139,7 @@ case class RegrAvgY(
group = "agg_funcs",
since = "3.3.0")
// scalastyle:on line.size.limit
-case class RegrR2(x: Expression, y: Expression) extends PearsonCorrelation(x,
y, true) {
+case class RegrR2(y: Expression, x: Expression) extends PearsonCorrelation(y,
x, true) {
override def prettyName: String = "regr_r2"
override val evaluateExpression: Expression = {
val corr = ck / sqrt(xMk * yMk)
@@ -148,7 +148,7 @@ case class RegrR2(x: Expression, y: Expression) extends
PearsonCorrelation(x, y,
}
override protected def withNewChildrenInternal(
newLeft: Expression, newRight: Expression): RegrR2 =
- this.copy(x = newLeft, y = newRight)
+ this.copy(y = newLeft, x = newRight)
}
// scalastyle:off line.size.limit
@@ -181,3 +181,28 @@ case class RegrSXX(
newLeft: Expression, newRight: Expression): RegrSXX =
this.copy(left = newLeft, right = newRight)
}
+
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(y, x) - Returns REGR_COUNT(y, x) * COVAR_POP(y, x) for
non-null pairs in a group, where `y` is the dependent variable and `x` is the
independent variable.",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS
tab(y, x);
+ 0.75
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS
tab(y, x);
+ 1.0
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4)
AS tab(y, x);
+ 1.0
+ """,
+ group = "agg_funcs",
+ since = "3.4.0")
+// scalastyle:on line.size.limit
+case class RegrSXY(y: Expression, x: Expression) extends Covariance(y, x,
true) {
+ override def prettyName: String = "regr_sxy"
+ override val evaluateExpression: Expression = {
+ If(n === 0.0, Literal.create(null, DoubleType), ck)
+ }
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): RegrSXY =
+ this.copy(y = newLeft, x = newRight)
+}
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index a67c402f4fe..accf9ea4577 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -1,6 +1,6 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- - Number of queries: 389
+ - Number of queries: 390
- Number of expressions that missing example: 12
- Expressions missing examples:
bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint
## Schema of Built-in Functions
@@ -375,6 +375,7 @@
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrCount | regr_count |
SELECT regr_count(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)
| struct<regr_count(y, x):bigint> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrR2 | regr_r2 |
SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) |
struct<regr_r2(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSXX | regr_sxx |
SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) |
struct<regr_sxx(y, x):double> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSXY | regr_sxy |
SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) |
struct<regr_sxy(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.Skewness | skewness |
SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) |
struct<skewness(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop | stddev_pop |
SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col) |
struct<stddev_pop(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | std |
SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<std(col):double>
|
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index dd10d7b5d1c..b45538b498d 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -276,6 +276,12 @@ SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT
NULL AND y IS NOT NULL;
SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k;
SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL GROUP BY k;
+-- SPARK-37681: Support ANSI Aggregate Function: regr_sxy
+SELECT regr_sxy(y, x) FROM testRegression;
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL;
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL GROUP BY k;
+
-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
diff --git
a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
index 2ae37f148ff..58def6729c8 100644
---
a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
+++
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql
@@ -83,7 +83,7 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
SELECT regr_count(b, a) FROM aggtest;
SELECT regr_sxx(b, a) FROM aggtest;
-- SELECT regr_syy(b, a) FROM aggtest;
--- SELECT regr_sxy(b, a) FROM aggtest;
+SELECT regr_sxy(b, a) FROM aggtest;
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
diff --git
a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
index b20b85193b9..525bfa8b0b0 100644
---
a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
+++
b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql
@@ -83,7 +83,7 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
SELECT regr_count(b, a) FROM aggtest;
SELECT regr_sxx(b, a) FROM aggtest;
-- SELECT regr_syy(b, a) FROM aggtest;
--- SELECT regr_sxy(b, a) FROM aggtest;
+SELECT regr_sxy(b, a) FROM aggtest;
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
SELECT regr_r2(b, a) FROM aggtest;
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index bad431383ff..89895f9b369 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 105
+-- Number of queries: 109
-- !query
@@ -1005,6 +1005,39 @@ struct<k:int,regr_sxx(y, x):double>
2 288.66666666666663
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+1 NULL
+2 240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT
NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+2 240.0
+
+
-- !query
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
diff --git
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
index 529438940b4..f0caf32afeb 100644
---
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 48
+-- Number of queries: 49
-- !query
@@ -304,6 +304,14 @@ struct<regr_sxx(b, a):double>
5099.0
+-- !query
+SELECT regr_sxy(b, a) FROM aggtest
+-- !query schema
+struct<regr_sxy(b, a):double>
+-- !query output
+2614.5158215500414
+
+
-- !query
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest
-- !query schema
diff --git
a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
index fba60af8219..4733fe13ff7 100644
---
a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
+++
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 47
+-- Number of queries: 48
-- !query
@@ -295,6 +295,14 @@ struct<regr_sxx(b, a):double>
5099.0
+-- !query
+SELECT regr_sxy(b, a) FROM aggtest
+-- !query schema
+struct<regr_sxy(b, a):double>
+-- !query output
+2614.5158215500414
+
+
-- !query
SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest
-- !query schema
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]