This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push:
new 5c5a68c03cf [SPARK-38219][SPARK-37691][3.3] Support ANSI Aggregation Function: percentile_cont and percentile_disc
5c5a68c03cf is described below
commit 5c5a68c03cf06a0e3b3b2f24cbd4841c489b89dc
Author: Jiaan Geng <[email protected]>
AuthorDate: Wed Apr 20 21:18:11 2022 +0800
[SPARK-38219][SPARK-37691][3.3] Support ANSI Aggregation Function: percentile_cont and percentile_disc
### What changes were proposed in this pull request?
This PR backports https://github.com/apache/spark/pull/35531 and https://github.com/apache/spark/pull/35041 to branch-3.3.
### Why are the changes needed?
Support `percentile_cont` and `percentile_disc` in the Spark 3.3 release.
### Does this PR introduce _any_ user-facing change?
No. This is a new feature.
### How was this patch tested?
New tests.
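For reference, a minimal sketch of the ANSI syntax this backport enables, using the `basic_pays` table and columns from the new window tests (the queries below are illustrative, not copied verbatim from the test files):
```sql
-- Interpolated vs. discrete percentile as ordinary aggregate functions.
SELECT
  percentile_cont(0.25) WITHIN GROUP (ORDER BY salary),
  percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC)
FROM basic_pays;

-- Both are also usable as window functions, provided the window has no ORDER BY or frame.
SELECT
  employee_name,
  department,
  percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department)
FROM basic_pays;
```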
Closes #36277 from beliefer/SPARK-38219_SPARK-37691_backport_3.3.
Authored-by: Jiaan Geng <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
docs/sql-ref-ansi-compliance.md | 1 +
.../spark/sql/catalyst/parser/SqlBaseLexer.g4 | 1 +
.../spark/sql/catalyst/parser/SqlBaseParser.g4 | 5 +-
.../sql/catalyst/analysis/CheckAnalysis.scala | 7 +-
.../{Percentile.scala => percentiles.scala} | 261 ++++++++++++++-------
.../spark/sql/catalyst/parser/AstBuilder.scala | 25 +-
.../expressions/aggregate/PercentileSuite.scala | 22 +-
.../sql/catalyst/parser/PlanParserSuite.scala | 28 ++-
.../test/resources/sql-tests/inputs/group-by.sql | 17 +-
.../inputs/postgreSQL/aggregates_part4.sql | 8 +-
.../inputs/udf/postgreSQL/udf-aggregates_part4.sql | 8 +-
.../src/test/resources/sql-tests/inputs/window.sql | 112 +++++++++
.../resources/sql-tests/results/group-by.sql.out | 39 ++-
.../results/postgreSQL/aggregates_part4.sql.out | 31 ++-
.../udf/postgreSQL/udf-aggregates_part4.sql.out | 31 ++-
.../resources/sql-tests/results/window.sql.out | 229 +++++++++++++++++-
16 files changed, 698 insertions(+), 127 deletions(-)
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index 692ea60b52c..94ef94a5e7b 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -511,6 +511,7 @@ Below is a list of all the keywords in Spark SQL.
|PARTITIONS|non-reserved|non-reserved|non-reserved|
|PERCENT|non-reserved|non-reserved|non-reserved|
|PERCENTILE_CONT|reserved|non-reserved|non-reserved|
+|PERCENTILE_DISC|reserved|non-reserved|non-reserved|
|PIVOT|non-reserved|non-reserved|non-reserved|
|PLACING|non-reserved|non-reserved|non-reserved|
|POSITION|non-reserved|non-reserved|reserved|
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
index 6c731bb02bc..e2c4c5444e5 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
@@ -258,6 +258,7 @@ PARTITION: 'PARTITION';
PARTITIONED: 'PARTITIONED';
PARTITIONS: 'PARTITIONS';
PERCENTILE_CONT: 'PERCENTILE_CONT';
+PERCENTILE_DISC: 'PERCENTILE_DISC';
PERCENTLIT: 'PERCENT';
PIVOT: 'PIVOT';
PLACING: 'PLACING';
diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index fe81f0ccb8a..9da39a1a96d 100644
--- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -844,8 +844,8 @@ primaryExpression
    FROM srcStr=valueExpression RIGHT_PAREN                                                #trim
| OVERLAY LEFT_PAREN input=valueExpression PLACING replace=valueExpression
    FROM position=valueExpression (FOR length=valueExpression)? RIGHT_PAREN                #overlay
- | PERCENTILE_CONT LEFT_PAREN percentage=valueExpression RIGHT_PAREN
-     WITHIN GROUP LEFT_PAREN ORDER BY sortItem RIGHT_PAREN                                #percentile
+ | name=(PERCENTILE_CONT | PERCENTILE_DISC) LEFT_PAREN percentage=valueExpression RIGHT_PAREN
+     WITHIN GROUP LEFT_PAREN ORDER BY sortItem RIGHT_PAREN ( OVER windowSpec)?            #percentile
;
constant
@@ -1449,6 +1449,7 @@ nonReserved
| PARTITIONED
| PARTITIONS
| PERCENTILE_CONT
+ | PERCENTILE_DISC
| PERCENTLIT
| PIVOT
| PLACING
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 1c2de771a3d..b714c5f9ceb 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -21,7 +21,7 @@ import scala.collection.mutable
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
-import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, PercentileCont, PercentileDisc}
import org.apache.spark.sql.catalyst.optimizer.{BooleanSimplification, DecorrelateInnerQuery, InlineCTE}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
@@ -233,6 +233,11 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog {
// Only allow window functions with an aggregate expression or an offset window
// function or a Pandas window UDF.
w.windowFunction match {
+ case agg @ AggregateExpression(_: PercentileCont | _: PercentileDisc, _, _, _, _)
+   if w.windowSpec.orderSpec.nonEmpty || w.windowSpec.frameSpecification !=
+     SpecifiedWindowFrame(RowFrame, UnboundedPreceding, UnboundedFollowing) =>
+   failAnalysis(
+     s"Cannot specify order by or frame for '${agg.aggregateFunction.prettyName}'.")
case _: AggregateExpression | _: FrameLessOffsetWindowFunction |
_: AggregateWindowFunction => // OK
case f: PythonUDF if PythonUDF.isWindowPandasUDF(f) => // OK
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
similarity index 69%
rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala
rename to sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
index a98585e0ff1..e861fb370ca 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Percentile.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala
@@ -24,93 +24,36 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.trees.TernaryLike
+import org.apache.spark.sql.catalyst.trees.{BinaryLike, TernaryLike}
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap
-/**
- * The Percentile aggregate function returns the exact percentile(s) of
numeric column `expr` at
- * the given percentage(s) with value range in [0.0, 1.0].
- *
- * Because the number of elements and their partial order cannot be determined
in advance.
- * Therefore we have to store all the elements in memory, and so notice that
too many elements can
- * cause GC paused and eventually OutOfMemory Errors.
- *
- * @param child child expression that produce numeric column value with
`child.eval(inputRow)`
- * @param percentageExpression Expression that represents a single percentage
value or an array of
- * percentage values. Each percentage value must
be in the range
- * [0.0, 1.0].
- */
-// scalastyle:off line.size.limit
-@ExpressionDescription(
- usage =
- """
- _FUNC_(col, percentage [, frequency]) - Returns the exact percentile
value of numeric
- or ansi interval column `col` at the given percentage. The value of
percentage must be
- between 0.0 and 1.0. The value of frequency should be positive integral
+abstract class PercentileBase extends TypedImperativeAggregate[OpenHashMap[AnyRef, Long]]
+  with ImplicitCastInputTypes {
- _FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) -
Returns the exact
- percentile value array of numeric column `col` at the given
percentage(s). Each value
- of the percentage array must be between 0.0 and 1.0. The value of
frequency should be
- positive integral
+ val child: Expression
+ val percentageExpression: Expression
+ val frequencyExpression : Expression
- """,
- examples = """
- Examples:
- > SELECT _FUNC_(col, 0.3) FROM VALUES (0), (10) AS tab(col);
- 3.0
- > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS
tab(col);
- [2.5,7.5]
- > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL
'10' MONTH) AS tab(col);
- 5.0
- > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND),
(INTERVAL '10' SECOND) AS tab(col);
- [2000000.0,5000000.0]
- """,
- group = "agg_funcs",
- since = "2.1.0")
-// scalastyle:on line.size.limit
-case class Percentile(
- child: Expression,
- percentageExpression: Expression,
- frequencyExpression : Expression,
- mutableAggBufferOffset: Int = 0,
- inputAggBufferOffset: Int = 0)
-  extends TypedImperativeAggregate[OpenHashMap[AnyRef, Long]] with ImplicitCastInputTypes
-  with TernaryLike[Expression] {
+ // Whether to reverse calculate percentile value
+ val reverse: Boolean
- def this(child: Expression, percentageExpression: Expression) = {
- this(child, percentageExpression, Literal(1L), 0, 0)
- }
-
- def this(child: Expression, percentageExpression: Expression, frequency:
Expression) = {
- this(child, percentageExpression, frequency, 0, 0)
- }
-
- override def prettyName: String = "percentile"
-
- override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int):
Percentile =
- copy(mutableAggBufferOffset = newMutableAggBufferOffset)
-
- override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int):
Percentile =
- copy(inputAggBufferOffset = newInputAggBufferOffset)
+ // Whether the value is discrete
+ protected def discrete: Boolean
// Mark as lazy so that percentageExpression is not evaluated during tree transformation.
@transient
private lazy val returnPercentileArray = percentageExpression.dataType.isInstanceOf[ArrayType]
@transient
- private lazy val percentages = percentageExpression.eval() match {
+ protected lazy val percentages = percentageExpression.eval() match {
case null => null
case num: Double => Array(num)
case arrayData: ArrayData => arrayData.toDoubleArray()
}
- override def first: Expression = child
- override def second: Expression = percentageExpression
- override def third: Expression = frequencyExpression
-
// Returns null for empty inputs
override def nullable: Boolean = true
@@ -151,7 +94,7 @@ case class Percentile(
}
}
- private def toDoubleValue(d: Any): Double = d match {
+ protected def toDoubleValue(d: Any): Double = d match {
case d: Decimal => d.toDouble
case n: Number => n.doubleValue
}
@@ -204,7 +147,11 @@ case class Percentile(
case intervalType: DayTimeIntervalType => intervalType.ordering
case otherType => QueryExecutionErrors.unsupportedTypeError(otherType)
}
- val sortedCounts = buffer.toSeq.sortBy(_._1)(ordering.asInstanceOf[Ordering[AnyRef]])
+ val sortedCounts = if (reverse) {
+   buffer.toSeq.sortBy(_._1)(ordering.asInstanceOf[Ordering[AnyRef]].reverse)
+ } else {
+   buffer.toSeq.sortBy(_._1)(ordering.asInstanceOf[Ordering[AnyRef]])
+ }
val accumulatedCounts = sortedCounts.scanLeft((sortedCounts.head._1, 0L)) {
case ((key1, count1), (key2, count2)) => (key2, count1 + count2)
}.tail
@@ -227,40 +174,44 @@ case class Percentile(
/**
* Get the percentile value.
- *
* This function has been based upon similar function from HIVE
* `org.apache.hadoop.hive.ql.udf.UDAFPercentile.getPercentile()`.
*/
- private def getPercentile(aggreCounts: Seq[(AnyRef, Long)], position: Double): Double = {
+ private def getPercentile(
+     accumulatedCounts: Seq[(AnyRef, Long)], position: Double): Double = {
// We may need to do linear interpolation to get the exact percentile
val lower = position.floor.toLong
val higher = position.ceil.toLong
// Use binary search to find the lower and the higher position.
- val countsArray = aggreCounts.map(_._2).toArray[Long]
- val lowerIndex = binarySearchCount(countsArray, 0, aggreCounts.size, lower + 1)
- val higherIndex = binarySearchCount(countsArray, 0, aggreCounts.size, higher + 1)
+ val countsArray = accumulatedCounts.map(_._2).toArray[Long]
+ val lowerIndex = binarySearchCount(countsArray, 0, accumulatedCounts.size, lower + 1)
+ val higherIndex = binarySearchCount(countsArray, 0, accumulatedCounts.size, higher + 1)
- val lowerKey = aggreCounts(lowerIndex)._1
+ val lowerKey = accumulatedCounts(lowerIndex)._1
if (higher == lower) {
// no interpolation needed because position does not have a fraction
return toDoubleValue(lowerKey)
}
- val higherKey = aggreCounts(higherIndex)._1
+ val higherKey = accumulatedCounts(higherIndex)._1
if (higherKey == lowerKey) {
// no interpolation needed because lower position and higher position has the same key
return toDoubleValue(lowerKey)
}
- // Linear interpolation to get the exact percentile
- (higher - position) * toDoubleValue(lowerKey) + (position - lower) * toDoubleValue(higherKey)
+ if (discrete) {
+   toDoubleValue(lowerKey)
+ } else {
+   // Linear interpolation to get the exact percentile
+   (higher - position) * toDoubleValue(lowerKey) + (position - lower) * toDoubleValue(higherKey)
+ }
}
/**
* use a binary search to find the index of the position closest to the current value.
*/
- private def binarySearchCount(
+ protected def binarySearchCount(
countsArray: Array[Long], start: Int, end: Int, value: Long): Int = {
util.Arrays.binarySearch(countsArray, 0, end, value) match {
case ix if ix < 0 => -(ix + 1)
@@ -268,12 +219,13 @@ case class Percentile(
}
}
+ private lazy val projection = UnsafeProjection.create(Array[DataType](child.dataType, LongType))
+
override def serialize(obj: OpenHashMap[AnyRef, Long]): Array[Byte] = {
val buffer = new Array[Byte](4 << 10) // 4K
val bos = new ByteArrayOutputStream()
val out = new DataOutputStream(bos)
try {
- val projection = UnsafeProjection.create(Array[DataType](child.dataType, LongType))
// Write pairs in counts map to byte buffer.
obj.foreach { case (key, count) =>
val row = InternalRow.apply(key, count)
@@ -316,6 +268,86 @@ case class Percentile(
bis.close()
}
}
+}
+
+/**
+ * The Percentile aggregate function returns the exact percentile(s) of
numeric column `expr` at
+ * the given percentage(s) with value range in [0.0, 1.0].
+ *
+ * Because the number of elements and their partial order cannot be determined
in advance.
+ * Therefore we have to store all the elements in memory, and so notice that
too many elements can
+ * cause GC paused and eventually OutOfMemory Errors.
+ *
+ * @param child child expression that produce numeric column value with
`child.eval(inputRow)`
+ * @param percentageExpression Expression that represents a single percentage
value or an array of
+ * percentage values. Each percentage value must
be in the range
+ * [0.0, 1.0].
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage =
+ """
+ _FUNC_(col, percentage [, frequency]) - Returns the exact percentile
value of numeric
+ or ansi interval column `col` at the given percentage. The value of
percentage must be
+ between 0.0 and 1.0. The value of frequency should be positive integral
+ _FUNC_(col, array(percentage1 [, percentage2]...) [, frequency]) -
Returns the exact
+ percentile value array of numeric column `col` at the given
percentage(s). Each value
+ of the percentage array must be between 0.0 and 1.0. The value of
frequency should be
+ positive integral
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_(col, 0.3) FROM VALUES (0), (10) AS tab(col);
+ 3.0
+ > SELECT _FUNC_(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS
tab(col);
+ [2.5,7.5]
+ > SELECT _FUNC_(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL
'10' MONTH) AS tab(col);
+ 5.0
+ > SELECT _FUNC_(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND),
(INTERVAL '10' SECOND) AS tab(col);
+ [2000000.0,5000000.0]
+ """,
+ group = "agg_funcs",
+ since = "2.1.0")
+// scalastyle:on line.size.limit
+case class Percentile(
+ child: Expression,
+ percentageExpression: Expression,
+ frequencyExpression : Expression,
+ mutableAggBufferOffset: Int = 0,
+ inputAggBufferOffset: Int = 0,
+ reverse: Boolean = false) extends PercentileBase with TernaryLike[Expression] {
+
+ def this(child: Expression, percentageExpression: Expression) = {
+ this(child, percentageExpression, Literal(1L), 0, 0)
+ }
+
+ def this(child: Expression, percentageExpression: Expression, frequency: Expression) = {
+   this(child, percentageExpression, frequency, 0, 0)
+ }
+
+ def this(child: Expression, percentageExpression: Expression, reverse: Boolean) = {
+   this(child, percentageExpression, Literal(1L), reverse = reverse)
+ }
+
+ override def first: Expression = child
+ override def second: Expression = percentageExpression
+ override def third: Expression = frequencyExpression
+
+ override def prettyName: String = "percentile"
+
+ override def discrete: Boolean = false
+
+ override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): Percentile =
+   copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+ override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): Percentile =
+   copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+ override protected def stringArgs: Iterator[Any] = if (discrete) {
+ super.stringArgs ++ Some(discrete)
+ } else {
+ super.stringArgs
+ }
override protected def withNewChildrenInternal(
newFirst: Expression, newSecond: Expression, newThird: Expression):
Percentile = copy(
@@ -324,3 +356,70 @@ case class Percentile(
frequencyExpression = newThird
)
}
+
+/**
+ * Return a percentile value based on a continuous distribution of
+ * numeric or ansi interval column at the given percentage (specified in ORDER
BY clause).
+ * The value of percentage must be between 0.0 and 1.0.
+ */
+case class PercentileCont(left: Expression, right: Expression, reverse: Boolean = false)
+ extends AggregateFunction
+ with RuntimeReplaceableAggregate
+ with ImplicitCastInputTypes
+ with BinaryLike[Expression] {
+ private lazy val percentile = new Percentile(left, right, reverse)
+ override def replacement: Expression = percentile
+ override def nodeName: String = "percentile_cont"
+ override def inputTypes: Seq[AbstractDataType] = percentile.inputTypes
+ override def sql(isDistinct: Boolean): String = {
+ val distinct = if (isDistinct) "DISTINCT " else ""
+ val direction = if (reverse) " DESC" else ""
+ s"$prettyName($distinct${right.sql}) WITHIN GROUP (ORDER BY v$direction)"
+ }
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): PercentileCont =
+ this.copy(left = newLeft, right = newRight)
+}
+
+/**
+ * The Percentile aggregate function returns the percentile(s) based on a
discrete distribution of
+ * numeric column `expr` at the given percentage(s) with value range in [0.0,
1.0].
+ *
+ * Because the number of elements and their partial order cannot be determined
in advance.
+ * Therefore we have to store all the elements in memory, and so notice that
too many elements can
+ * cause GC paused and eventually OutOfMemory Errors.
+ */
+case class PercentileDisc(
+ child: Expression,
+ percentageExpression: Expression,
+ reverse: Boolean = false,
+ mutableAggBufferOffset: Int = 0,
+ inputAggBufferOffset: Int = 0) extends PercentileBase with BinaryLike[Expression] {
+
+ val frequencyExpression: Expression = Literal(1L)
+
+ override def left: Expression = child
+ override def right: Expression = percentageExpression
+
+ override def prettyName: String = "percentile_disc"
+
+ override def discrete: Boolean = true
+
+ override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): PercentileDisc =
+   copy(mutableAggBufferOffset = newMutableAggBufferOffset)
+
+ override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): PercentileDisc =
+   copy(inputAggBufferOffset = newInputAggBufferOffset)
+
+ override def sql(isDistinct: Boolean): String = {
+ val distinct = if (isDistinct) "DISTINCT " else ""
+ val direction = if (reverse) " DESC" else ""
+ s"$prettyName($distinct${right.sql}) WITHIN GROUP (ORDER BY v$direction)"
+ }
+
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): PercentileDisc = copy(
+ child = newLeft,
+ percentageExpression = newRight
+ )
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index d334b5780f7..e788368604f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, SQLConfHelper, TableId
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat}
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last, Percentile}
+import org.apache.spark.sql.catalyst.expressions.aggregate.{First, Last, PercentileCont, PercentileDisc}
import org.apache.spark.sql.catalyst.parser.SqlBaseParser._
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
@@ -1836,11 +1836,26 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with SQLConfHelper wit
override def visitPercentile(ctx: PercentileContext): Expression = withOrigin(ctx) {
val percentage = expression(ctx.percentage)
val sortOrder = visitSortItem(ctx.sortItem)
- val percentile = sortOrder.direction match {
-   case Ascending => new Percentile(sortOrder.child, percentage)
-   case Descending => new Percentile(sortOrder.child, Subtract(Literal(1), percentage))
+ val percentile = ctx.name.getType match {
+ case SqlBaseParser.PERCENTILE_CONT =>
+ sortOrder.direction match {
+ case Ascending => PercentileCont(sortOrder.child, percentage)
+ case Descending => PercentileCont(sortOrder.child, percentage, true)
+ }
+ case SqlBaseParser.PERCENTILE_DISC =>
+ sortOrder.direction match {
+ case Ascending => PercentileDisc(sortOrder.child, percentage)
+ case Descending => PercentileDisc(sortOrder.child, percentage, true)
+ }
+ }
+ val aggregateExpression = percentile.toAggregateExpression()
+ ctx.windowSpec match {
+ case spec: WindowRefContext =>
+ UnresolvedWindowExpression(aggregateExpression, visitWindowRef(spec))
+ case spec: WindowDefContext =>
+ WindowExpression(aggregateExpression, visitWindowDef(spec))
+ case _ => aggregateExpression
}
- percentile.toAggregateExpression()
}
/**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
index b5882b1ab40..7b85be05e3b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/PercentileSuite.scala
@@ -83,8 +83,8 @@ class PercentileSuite extends SparkFunSuite {
}
private def runTest(agg: Percentile,
- rows : Seq[Seq[Any]],
- expectedPercentiles : Seq[Double]): Unit = {
+ rows : Seq[Seq[Any]],
+ expectedPercentiles : Seq[Double]): Unit = {
assert(agg.nullable)
val group1 = (0 until rows.length / 2)
val group1Buffer = agg.createAggregationBuffer()
@@ -218,7 +218,7 @@ class PercentileSuite extends SparkFunSuite {
val percentile2 = new Percentile(child, percentage)
assertEqual(percentile2.checkInputDataTypes(),
TypeCheckFailure(s"Percentage(s) must be between 0.0 and 1.0, " +
- s"but got ${percentage.simpleString(100)}"))
+ s"but got ${percentage.simpleString(100)}"))
}
val nonFoldablePercentage = Seq(NonFoldableLiteral(0.5),
@@ -270,7 +270,6 @@ class PercentileSuite extends SparkFunSuite {
}
test("nulls in percentage expression") {
-
assert(new Percentile(
AttributeReference("a", DoubleType)(),
percentageExpression = Literal(null, DoubleType)).checkInputDataTypes()
===
@@ -280,14 +279,13 @@ class PercentileSuite extends SparkFunSuite {
Seq(CreateArray(Seq(null).map(Literal(_))), CreateArray(Seq(0.1D,
null).map(Literal(_))))
nullPercentageExprs.foreach { percentageExpression =>
- val wrongPercentage = new Percentile(
- AttributeReference("a", DoubleType)(),
- percentageExpression = percentageExpression)
- assert(
- wrongPercentage.checkInputDataTypes() match {
- case TypeCheckFailure(msg) if msg.contains("argument 2 requires
array<double>") => true
- case _ => false
- })
+ val wrongPercentage = new Percentile(
+ AttributeReference("a", DoubleType)(),
+ percentageExpression = percentageExpression)
+ assert(wrongPercentage.checkInputDataTypes() match {
+ case TypeCheckFailure(msg) if msg.contains("argument 2 requires
array<double>") => true
+ case _ => false
+ })
}
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 3e2d917a893..688c0d12373 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.parser
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.catalyst.analysis.{AnalysisTest,
RelationTimeTravel, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction,
UnresolvedGenerator, UnresolvedInlineTable, UnresolvedRelation, UnresolvedStar,
UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction}
import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.expressions.aggregate.Percentile
+import org.apache.spark.sql.catalyst.expressions.aggregate.{PercentileCont, PercentileDisc}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.internal.SQLConf
@@ -1303,24 +1303,36 @@ class PlanParserSuite extends AnalysisTest {
"timestamp expression cannot contain subqueries")
}
- test("PERCENTILE_CONT function") {
- def assertPercentileContPlans(inputSQL: String, expectedExpression:
Expression): Unit = {
+ test("PERCENTILE_CONT & PERCENTILE_DISC") {
+ def assertPercentilePlans(inputSQL: String, expectedExpression:
Expression): Unit = {
comparePlans(
parsePlan(inputSQL),
Project(Seq(UnresolvedAlias(expectedExpression)), OneRowRelation())
)
}
- assertPercentileContPlans(
+ assertPercentilePlans(
"SELECT PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY col)",
- new Percentile(UnresolvedAttribute("col"), Literal(Decimal(0.1),
DecimalType(1, 1)))
+ PercentileCont(UnresolvedAttribute("col"), Literal(Decimal(0.1),
DecimalType(1, 1)))
.toAggregateExpression()
)
- assertPercentileContPlans(
+ assertPercentilePlans(
"SELECT PERCENTILE_CONT(0.1) WITHIN GROUP (ORDER BY col DESC)",
- new Percentile(UnresolvedAttribute("col"),
- Subtract(Literal(1), Literal(Decimal(0.1), DecimalType(1,
1)))).toAggregateExpression()
+ PercentileCont(UnresolvedAttribute("col"),
+ Literal(Decimal(0.1), DecimalType(1, 1)), true).toAggregateExpression()
+ )
+
+ assertPercentilePlans(
+ "SELECT PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY col)",
+ PercentileDisc(UnresolvedAttribute("col"), Literal(Decimal(0.1),
DecimalType(1, 1)))
+ .toAggregateExpression()
+ )
+
+ assertPercentilePlans(
+ "SELECT PERCENTILE_DISC(0.1) WITHIN GROUP (ORDER BY col DESC)",
+ PercentileDisc(UnresolvedAttribute("col"),
+ Literal(Decimal(0.1), DecimalType(1, 1)), true).toAggregateExpression()
)
}
}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
index b97b24140ed..291a8478c7a 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql
@@ -272,8 +272,8 @@ SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (
-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
SELECT
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr;
SELECT
k,
@@ -282,3 +282,16 @@ SELECT
FROM aggr
GROUP BY k
ORDER BY k;
+
+-- SPARK-37691: Support ANSI Aggregation Function: percentile_disc
+SELECT
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
+FROM aggr;
+SELECT
+ k,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
+FROM aggr
+GROUP BY k
+ORDER BY k;
diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql
index 0d255bed24e..3cf57b1f0e5 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql
@@ -33,10 +33,10 @@
-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1))
v(p)
-- group by p order by p;
--- select percentile_cont(0.5) within group (order by b) from aggtest;
--- select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
--- select percentile_cont(0.5) within group (order by thousand) from tenk1;
--- select percentile_disc(0.5) within group (order by thousand) from tenk1;
+select percentile_cont(0.5) within group (order by b) from aggtest;
+select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
+select percentile_cont(0.5) within group (order by thousand) from tenk1;
+select percentile_disc(0.5) within group (order by thousand) from tenk1;
-- [SPARK-28661] Hypothetical-Set Aggregate Functions
-- select rank(3) within group (order by x)
-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql
index 8aea00073ee..dd14a7db0db 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql
@@ -30,10 +30,10 @@
-- (values (0::float8),(0.1),(0.25),(0.4),(0.5),(0.6),(0.75),(0.9),(1))
v(p)
-- group by p order by p;
--- select percentile_cont(0.5) within group (order by b) from aggtest;
--- select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
--- select percentile_cont(0.5) within group (order by thousand) from tenk1;
--- select percentile_disc(0.5) within group (order by thousand) from tenk1;
+select percentile_cont(0.5) within group (order by b) from aggtest;
+select percentile_cont(0.5) within group (order by b), sum(b) from aggtest;
+select percentile_cont(0.5) within group (order by thousand) from tenk1;
+select percentile_disc(0.5) within group (order by thousand) from tenk1;
-- [SPARK-28661] Hypothetical-Set Aggregate Functions
-- select rank(3) within group (order by x)
-- from (values (1),(1),(2),(2),(3),(3),(4)) v(x);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql b/sql/core/src/test/resources/sql-tests/inputs/window.sql
index 666c0577f13..e982683250c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/window.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql
@@ -440,3 +440,115 @@ SELECT
SUM(salary) OVER w sum_salary
FROM
basic_pays;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department)
+FROM basic_pays
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ORDER BY salary),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ORDER BY salary)
+FROM basic_pays
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ORDER BY salary),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ORDER BY salary)
+FROM basic_pays
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+FROM basic_pays
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+FROM basic_pays
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department)
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER w,
+ percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WHERE salary > 8900
+WINDOW w AS (PARTITION BY department)
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ORDER BY salary)
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ORDER BY salary)
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+ORDER BY salary;
+
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+ORDER BY salary;
diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
index 04d34972c30..ba06b148d2d 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 99
+-- Number of queries: 101
-- !query
@@ -974,11 +974,11 @@ struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y
IS NOT NULL))):double,
-- !query
SELECT
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
-- !query schema
-struct<percentile(v, 0.25, 1):double,percentile(v, (1 - 0.25), 1):double>
+struct<percentile_cont(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
-- !query output
10.0 30.0
@@ -992,10 +992,39 @@ FROM aggr
GROUP BY k
ORDER BY k
-- !query schema
-struct<k:int,percentile(v, 0.25, 1):double,percentile(v, (1 - 0.25), 1):double>
+struct<k:int,percentile_cont(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
-- !query output
0 10.0 30.0
1 12.5 17.5
2 17.5 26.25
3 60.0 60.0
4 NULL NULL
+
+
+-- !query
+SELECT
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
+FROM aggr
+-- !query schema
+struct<percentile_disc(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
+-- !query output
+10.0 30.0
+
+
+-- !query
+SELECT
+ k,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
+FROM aggr
+GROUP BY k
+ORDER BY k
+-- !query schema
+struct<k:int,percentile_disc(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
+-- !query output
+0 10.0 30.0
+1 10.0 20.0
+2 10.0 30.0
+3 60.0 60.0
+4 NULL NULL
diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out
index b7bbdc50dfb..8c21a5067bf 100644
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out
@@ -1,5 +1,34 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 0
+-- Number of queries: 4
+-- !query
+select percentile_cont(0.5) within group (order by b) from aggtest
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+53.44850015640259
+
+-- !query
+select percentile_cont(0.5) within group (order by b), sum(b) from aggtest
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double,sum(b):double>
+-- !query output
+53.44850015640259 431.77260909229517
+
+
+-- !query
+select percentile_cont(0.5) within group (order by thousand) from tenk1
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+499.5
+
+
+-- !query
+select percentile_disc(0.5) within group (order by thousand) from tenk1
+-- !query schema
+struct<percentile_disc(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+499.0
diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out
index b7bbdc50dfb..8c21a5067bf 100644
--- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out
@@ -1,5 +1,34 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 0
+-- Number of queries: 4
+-- !query
+select percentile_cont(0.5) within group (order by b) from aggtest
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+53.44850015640259
+
+-- !query
+select percentile_cont(0.5) within group (order by b), sum(b) from aggtest
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double,sum(b):double>
+-- !query output
+53.44850015640259 431.77260909229517
+
+
+-- !query
+select percentile_cont(0.5) within group (order by thousand) from tenk1
+-- !query schema
+struct<percentile_cont(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+499.5
+
+
+-- !query
+select percentile_disc(0.5) within group (order by thousand) from tenk1
+-- !query schema
+struct<percentile_disc(0.5) WITHIN GROUP (ORDER BY v):double>
+-- !query output
+499.0
diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out
index d13411e3333..dcce285c309 100644
--- a/sql/core/src/test/resources/sql-tests/results/window.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 55
+-- Number of queries: 66
-- !query
@@ -1197,3 +1197,230 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Window specification w is not defined in the WINDOW clause.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department)
+FROM basic_pays
+ORDER BY salary
+-- !query schema
+struct<employee_name:string,department:string,salary:int,percentile_cont(0.25)
WITHIN GROUP (ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED
PRECEDING AND UNBOUNDED FOLLOWING):double,percentile_disc(0.25) WITHIN GROUP
(ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v
DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING):double,p [...]
+-- !query output
+Leslie Thompson IT 5186 5917.75 5186.0 7381.25 8113.0
+Anthony Bow Accounting 6627 8543.75 8435.0 9746.5 9998.0
+Foon Yue Tseng Sales 6660 8550.75 6660.0 9721.5 10563.0
+Gerard Hernandez SCM 6949 10449.0 10449.0 11303.0 11303.0
+Leslie Jennings IT 8113 5917.75 5186.0 7381.25 8113.0
+Diane Murphy Accounting 8435 8543.75 8435.0 9746.5 9998.0
+William Patterson Accounting 8870 8543.75 8435.0 9746.5 9998.0
+Jeff Firrelli Accounting 8992 8543.75 8435.0 9746.5 9998.0
+Julie Firrelli Sales 9181 8550.75 6660.0 9721.5 10563.0
+Steve Patterson Sales 9441 8550.75 6660.0 9721.5 10563.0
+Mary Patterson Accounting 9998 8543.75 8435.0 9746.5 9998.0
+Loui Bondur SCM 10449 10449.0 10449.0 11303.0 11303.0
+George Vanauf Sales 10563 8550.75 6660.0 9721.5 10563.0
+Barry Jones SCM 10586 10449.0 10449.0 11303.0 11303.0
+Pamela Castillo SCM 11303 10449.0 10449.0 11303.0 11303.0
+Gerard Bondur Accounting 11472 8543.75 8435.0 9746.5 9998.0
+Larry Bott SCM 11798 10449.0 10449.0 11303.0 11303.0
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department ORDER BY salary),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department ORDER BY salary)
+FROM basic_pays
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_cont'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department ORDER BY salary),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department ORDER BY salary)
+FROM basic_pays
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_disc'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING),
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+FROM basic_pays
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_cont'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER (PARTITION BY
department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING),
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER (PARTITION
BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING)
+FROM basic_pays
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_disc'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department)
+ORDER BY salary
+-- !query schema
+struct<employee_name:string,department:string,salary:int,percentile_cont(0.25)
WITHIN GROUP (ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED
PRECEDING AND UNBOUNDED FOLLOWING):double,percentile_disc(0.25) WITHIN GROUP
(ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v
DESC) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING):double,p [...]
+-- !query output
+Leslie Thompson IT 5186 5917.75 5186.0 7381.25 8113.0
+Anthony Bow Accounting 6627 8543.75 8435.0 9746.5 9998.0
+Foon Yue Tseng Sales 6660 8550.75 6660.0 9721.5 10563.0
+Gerard Hernandez SCM 6949 10449.0 10449.0 11303.0 11303.0
+Leslie Jennings IT 8113 5917.75 5186.0 7381.25 8113.0
+Diane Murphy Accounting 8435 8543.75 8435.0 9746.5 9998.0
+William Patterson Accounting 8870 8543.75 8435.0 9746.5 9998.0
+Jeff Firrelli Accounting 8992 8543.75 8435.0 9746.5 9998.0
+Julie Firrelli Sales 9181 8550.75 6660.0 9721.5 10563.0
+Steve Patterson Sales 9441 8550.75 6660.0 9721.5 10563.0
+Mary Patterson Accounting 9998 8543.75 8435.0 9746.5 9998.0
+Loui Bondur SCM 10449 10449.0 10449.0 11303.0 11303.0
+George Vanauf Sales 10563 8550.75 6660.0 9721.5 10563.0
+Barry Jones SCM 10586 10449.0 10449.0 11303.0 11303.0
+Pamela Castillo SCM 11303 10449.0 10449.0 11303.0 11303.0
+Gerard Bondur Accounting 11472 8543.75 8435.0 9746.5 9998.0
+Larry Bott SCM 11798 10449.0 10449.0 11303.0 11303.0
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.5) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER w,
+ percentile_disc(0.5) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WHERE salary > 8900
+WINDOW w AS (PARTITION BY department)
+ORDER BY salary
+-- !query schema
+struct<employee_name:string,department:string,salary:int,percentile_cont(0.5)
WITHIN GROUP (ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED
PRECEDING AND UNBOUNDED FOLLOWING):double,percentile_disc(0.5) WITHIN GROUP
(ORDER BY v) OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING):double,percentile_cont(0.5) WITHIN GROUP (ORDER BY v DESC)
OVER (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING):double,perc [...]
+-- !query output
+Jeff Firrelli Accounting 8992 9998.0 9998.0 9998.0 9998.0
+Julie Firrelli Sales 9181 9441.0 9441.0 9441.0 9441.0
+Steve Patterson Sales 9441 9441.0 9441.0 9441.0 9441.0
+Mary Patterson Accounting 9998 9998.0 9998.0 9998.0 9998.0
+Loui Bondur SCM 10449 10944.5 10586.0 10944.5 11303.0
+George Vanauf Sales 10563 9441.0 9441.0 9441.0 9441.0
+Barry Jones SCM 10586 10944.5 10586.0 10944.5 11303.0
+Pamela Castillo SCM 11303 10944.5 10586.0 10944.5 11303.0
+Gerard Bondur Accounting 11472 9998.0 9998.0 9998.0 9998.0
+Larry Bott SCM 11798 10944.5 10586.0 10944.5 11303.0
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ORDER BY salary)
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_cont'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ORDER BY salary)
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_disc'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1
FOLLOWING)
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_cont'.
+
+
+-- !query
+SELECT
+ employee_name,
+ department,
+ salary,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) OVER w,
+ percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) OVER w
+FROM basic_pays
+WINDOW w AS (PARTITION BY department ROWS BETWEEN UNBOUNDED PRECEDING AND 1
FOLLOWING)
+ORDER BY salary
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Cannot specify order by or frame for 'percentile_disc'.