MaxGekk commented on code in PR #35041:
URL: https://github.com/apache/spark/pull/35041#discussion_r841665761
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala:
##########
@@ -325,3 +339,154 @@ case class Percentile(
frequencyExpression = newThird
)
}
+
+/**
+ * Return a percentile value based on a continuous distribution of
+ * numeric or ansi interval column at the given percentage (specified in ORDER
BY clause).
+ * The value of percentage must be between 0.0 and 1.0.
+ */
+case class PercentileCont(left: Expression, right: Expression)
+ extends AggregateFunction
+ with RuntimeReplaceableAggregate
+ with ImplicitCastInputTypes
+ with BinaryLike[Expression] {
+ private lazy val percentile = new Percentile(left, right)
+ override def replacement: Expression = percentile
+ override def nodeName: String = "percentile_cont"
+ override def inputTypes: Seq[AbstractDataType] = percentile.inputTypes
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): PercentileCont =
+ this.copy(left = newLeft, right = newRight)
+}
+
+/**
+ * The Percentile aggregate function returns the percentile(s) based on a
discrete distribution of
+ * numeric column `expr` at the given percentage(s) with value range in [0.0,
1.0].
+ *
+ * Because the number of elements and their partial order cannot be determined
in advance.
+ * Therefore we have to store all the elements in memory, and so notice that
too many elements can
+ * cause GC paused and eventually OutOfMemory Errors.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage =
+ """
+ _FUNC_(percentage) WITHIN GROUP (ORDER BY col) - Returns the
percentile(s) based on a
+ discrete distribution of numeric column `col` at the given
percentage(s) with value range
+ in [0.0, 1.0].
+ """,
+ examples = """
+ Examples:
+ > SELECT _FUNC_(0.3) WITHIN GROUP (ORDER BY col) FROM VALUES (0), (10)
AS tab(col);
+ 0
+ > SELECT _FUNC_(0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10'
MONTH) AS tab(col);
+ 0
Review Comment:
The output is not checked by ExpressionInfoSuite, in fact.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala:
##########
@@ -226,10 +226,11 @@ trait CheckAnalysis extends PredicateHelper with
LookupCatalog {
// Only allow window functions with an aggregate expression or an
offset window
// function or a Pandas window UDF.
w.windowFunction match {
- case AggregateExpression(_: PercentileCont, _, _, _, _)
+ case AggregateExpression(_: PercentileCont | _: PercentileDisc,
_, _, _, _)
if w.windowSpec.orderSpec.nonEmpty ||
w.windowSpec.frameSpecification !=
SpecifiedWindowFrame(RowFrame, UnboundedPreceding,
UnboundedFollowing) =>
- failAnalysis("Cannot specify order by or frame for
'PERCENTILE_CONT'.")
+ failAnalysis(
+ "Cannot specify order by or frame for 'PERCENTILE_CONT' or
'PERCENTILE_DISC'.")
Review Comment:
Can't we say exactly which one causes the failure? What does `prettyName()`
return?
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/percentiles.scala:
##########
@@ -325,3 +339,154 @@ case class Percentile(
frequencyExpression = newThird
)
}
+
+/**
+ * Return a percentile value based on a continuous distribution of
+ * numeric or ansi interval column at the given percentage (specified in ORDER
BY clause).
+ * The value of percentage must be between 0.0 and 1.0.
+ */
+case class PercentileCont(left: Expression, right: Expression)
+ extends AggregateFunction
+ with RuntimeReplaceableAggregate
+ with ImplicitCastInputTypes
+ with BinaryLike[Expression] {
+ private lazy val percentile = new Percentile(left, right)
+ override def replacement: Expression = percentile
+ override def nodeName: String = "percentile_cont"
+ override def inputTypes: Seq[AbstractDataType] = percentile.inputTypes
+ override protected def withNewChildrenInternal(
+ newLeft: Expression, newRight: Expression): PercentileCont =
+ this.copy(left = newLeft, right = newRight)
+}
+
+/**
+ * The Percentile aggregate function returns the percentile(s) based on a
discrete distribution of
+ * numeric column `expr` at the given percentage(s) with value range in [0.0,
1.0].
+ *
+ * Because the number of elements and their partial order cannot be determined
in advance.
+ * Therefore we have to store all the elements in memory, and so notice that
too many elements can
+ * cause GC paused and eventually OutOfMemory Errors.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
Review Comment:
`PercentileDisc` is not bound to any function name in `FunctionRegistry`,
so this annotation is useless, I guess.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]