This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 73789da [SPARK-37676][SQL] Support ANSI Aggregation Function: percentile_cont 73789da is described below commit 73789da962c9037bde21a53fb5826b10475658fe Author: Jiaan Geng <belie...@163.com> AuthorDate: Mon Dec 27 16:12:43 2021 +0300 [SPARK-37676][SQL] Support ANSI Aggregation Function: percentile_cont ### What changes were proposed in this pull request? `PERCENTILE_CONT` is an ANSI aggregate function. Mainstream databases support `percentile_cont` as shown below: **Postgresql** https://www.postgresql.org/docs/9.4/functions-aggregate.html **Teradata** https://docs.teradata.com/r/kmuOwjp1zEYg98JsB8fu_A/cPkFySIBORL~M938Zv07Cg **Snowflake** https://docs.snowflake.com/en/sql-reference/functions/percentile_cont.html **Oracle** https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/PERCENTILE_CONT.html#GUID-CA259452-A565-41B3-A4F4-DD74B66CEDE0 **H2** http://www.h2database.com/html/functions-aggregate.html#percentile_cont **Sybase** https://infocenter.sybase.com/help/index.jsp?topic=/com.sybase.infocenter.dc01776.1601/doc/html/san1278453109663.html **Exasol** https://docs.exasol.com/sql_references/functions/alphabeticallistfunctions/percentile_cont.htm **RedShift** https://docs.aws.amazon.com/redshift/latest/dg/r_PERCENTILE_CONT.html **Yellowbrick** https://www.yellowbrick.com/docs/2.2/ybd_sqlref/percentile_cont.html **Mariadb** https://mariadb.com/kb/en/percentile_cont/ **Phoenix** http://phoenix.incubator.apache.org/language/functions.html#percentile_cont **Singlestore** https://docs.singlestore.com/db/v7.6/en/reference/sql-reference/window-functions/percentile_cont-and-median.html ### Why are the changes needed? `PERCENTILE_CONT` is very useful. Exposing the expression can make the migration from other systems to Spark SQL easier. ### Does this PR introduce _any_ user-facing change? 'Yes'. New feature. ### How was this patch tested? New tests. 
Closes #34936 from beliefer/SPARK-37676. Authored-by: Jiaan Geng <belie...@163.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- docs/sql-ref-ansi-compliance.md | 2 ++ .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 6 ++++ .../spark/sql/catalyst/parser/AstBuilder.scala | 15 +++++++- .../sql/catalyst/parser/PlanParserSuite.scala | 24 ++++++++++++- .../test/resources/sql-tests/inputs/group-by.sql | 18 +++++++++- .../resources/sql-tests/results/group-by.sql.out | 41 +++++++++++++++++++++- 6 files changed, 102 insertions(+), 4 deletions(-) diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md index 7b5bde4..1b4a778 100644 --- a/docs/sql-ref-ansi-compliance.md +++ b/docs/sql-ref-ansi-compliance.md @@ -494,6 +494,7 @@ Below is a list of all the keywords in Spark SQL. |PARTITIONED|non-reserved|non-reserved|non-reserved| |PARTITIONS|non-reserved|non-reserved|non-reserved| |PERCENT|non-reserved|non-reserved|non-reserved| +|PERCENTILE_CONT|reserved|non-reserved|non-reserved| |PIVOT|non-reserved|non-reserved|non-reserved| |PLACING|non-reserved|non-reserved|non-reserved| |POSITION|non-reserved|non-reserved|reserved| @@ -594,5 +595,6 @@ Below is a list of all the keywords in Spark SQL. 
|WHERE|reserved|non-reserved|reserved| |WINDOW|non-reserved|non-reserved|reserved| |WITH|reserved|non-reserved|reserved| +|WITHIN|reserved|non-reserved|reserved| |YEAR|non-reserved|non-reserved|non-reserved| |ZONE|non-reserved|non-reserved|non-reserved| diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6511489..5037520 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -888,6 +888,8 @@ primaryExpression FROM srcStr=valueExpression ')' #trim | OVERLAY '(' input=valueExpression PLACING replace=valueExpression FROM position=valueExpression (FOR length=valueExpression)? ')' #overlay + | PERCENTILE_CONT '(' percentage=valueExpression ')' + WITHIN GROUP '(' ORDER BY sortItem ')' #percentile ; constant @@ -1475,6 +1477,7 @@ nonReserved | PARTITION | PARTITIONED | PARTITIONS + | PERCENTILE_CONT | PERCENTLIT | PIVOT | PLACING @@ -1570,6 +1573,7 @@ nonReserved | WHERE | WINDOW | WITH + | WITHIN | YEAR | ZONE //--DEFAULT-NON-RESERVED-END @@ -1747,6 +1751,7 @@ OVERWRITE: 'OVERWRITE'; PARTITION: 'PARTITION'; PARTITIONED: 'PARTITIONED'; PARTITIONS: 'PARTITIONS'; +PERCENTILE_CONT: 'PERCENTILE_CONT'; PERCENTLIT: 'PERCENT'; PIVOT: 'PIVOT'; PLACING: 'PLACING'; @@ -1847,6 +1852,7 @@ WHEN: 'WHEN'; WHERE: 'WHERE'; WINDOW: 'WINDOW'; WITH: 'WITH'; +WITHIN: 'WITHIN'; YEAR: 'YEAR'; ZONE: 'ZONE'; //--SPARK-KEYWORD-LIST-END diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index c6d7d5d..8edcce7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -34,7 +34,7 @@ import 
org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableId import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last} +import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last, Percentile} import org.apache.spark.sql.catalyst.parser.SqlBaseParser._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -1829,6 +1829,19 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** + * Create a Percentile expression. + */ + override def visitPercentile(ctx: PercentileContext): Expression = withOrigin(ctx) { + val percentage = expression(ctx.percentage) + val sortOrder = visitSortItem(ctx.sortItem) + val percentile = sortOrder.direction match { + case Ascending => new Percentile(sortOrder.child, percentage) + case Descending => new Percentile(sortOrder.child, Subtract(Literal(1), percentage)) + } + percentile.toAggregateExpression() + } + + /** * Create a Substring/Substr expression. 
*/ override def visitSubstring(ctx: SubstringContext): Expression = withOrigin(ctx) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala index 76c620d..e8088a6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql.catalyst.parser import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, RelationTimeTravel, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction, UnresolvedGenerator, UnresolvedInlineTable, UnresolvedRelation, UnresolvedStar, UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.aggregate.Percentile import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{IntegerType, LongType, StringType} +import org.apache.spark.sql.types.{Decimal, DecimalType, IntegerType, LongType, StringType} /** * Parser test cases for rules defined in [[CatalystSqlParser]] / [[AstBuilder]]. 
@@ -1299,4 +1300,25 @@ class PlanParserSuite extends AnalysisTest { intercept("SELECT * FROM a.b.c TIMESTAMP AS OF (select 1)", "timestamp expression cannot contain subqueries") } + + test("PERCENTILE_CONT function") { + def assertPercentileContPlans(inputSQL: String, expectedExpression: Expression): Unit = { + comparePlans( + parsePlan(inputSQL), + Project(Seq(UnresolvedAlias(expectedExpression)), OneRowRelation()) + ) + } + + assertPercentileContPlans( + "SELECT PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY col)", + new Percentile(UnresolvedAttribute("col"), Literal(Decimal(0.1), DecimalType(1, 1))) + .toAggregateExpression() + ) + + assertPercentileContPlans( + "SELECT PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY col DESC)", + new Percentile(UnresolvedAttribute("col"), + Subtract(Literal(1), Literal(Decimal(0.1), DecimalType(1, 1)))).toAggregateExpression() + ) + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 1c29177..cb82bfa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -10,6 +10,9 @@ AS testData(a, b); CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES (1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35) AS testRegression(k, y, x); +CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES +(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null) +AS aggr(k, v); -- Aggregate with empty GroupBy expressions. 
SELECT a, COUNT(b) FROM testData; @@ -226,4 +229,17 @@ SELECT array_agg(b) FROM VALUES (1,4),(2,3),(1,4),(2,4) AS v(a,b) -GROUP BY a; \ No newline at end of file +GROUP BY a; + +-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont +SELECT + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr; +SELECT + k, + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k; diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 73708f9..dce5fd0 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 73 +-- Number of queries: 74 -- !query @@ -23,6 +23,16 @@ struct<> -- !query +CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES +(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null) +AS aggr(k, v) +-- !query schema +struct<> +-- !query output + + + +-- !query SELECT a, COUNT(b) FROM testData -- !query schema struct<> @@ -762,3 +772,32 @@ struct<a:int,collect_list(b):array<int>,collect_list(b):array<int>> -- !query output 1 [4,4] [4,4] 2 [3,4] [3,4] + + +-- !query +SELECT + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +-- !query schema +struct<percentile(v, 0.25, 1):double,percentile(v, (1 - 0.25), 1):double> +-- !query output +10.0 30.0 + + +-- !query +SELECT + k, + percentile_cont(0.25) WITHIN GROUP (ORDER BY v), + percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC) +FROM aggr +GROUP BY k +ORDER BY k +-- !query schema +struct<k:int,percentile(v, 0.25, 1):double,percentile(v, (1 - 0.25), 1):double> +-- !query output 
+0 10.0 30.0 +1 12.5 17.5 +2 17.5 26.25 +3 60.0 60.0 +4 NULL NULL --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org