10110346 closed pull request #23304: [SPARK-26353][SQL]Add typed aggregate
functions: max&&min
URL: https://github.com/apache/spark/pull/23304
This is a PR merged from a forked repository. Because GitHub hides the
original diff of a foreign (forked) pull request once it is merged, the
diff is reproduced below for the sake of provenance:
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/typedaggregators.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/typedaggregators.scala
index b6550bf3e4aac..2d08ea3fce6fb 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/typedaggregators.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/typedaggregators.scala
@@ -99,3 +99,71 @@ class TypedAverage[IN](val f: IN => Double) extends
Aggregator[IN, (Double, Long
toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
}
}
+
/**
 * Typed aggregator computing the maximum of a Double extracted from each input row.
 *
 * @param f function extracting the Double value to maximize from each input row
 */
class TypedMaxDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  // NOTE(review): an empty group yields Double.MinValue (the zero element) rather
  // than null/NaN — confirm this is the intended semantics for empty input.
  override def zero: Double = Double.MinValue

  // Evaluate the (possibly expensive) user function once per row, not twice.
  override def reduce(b: Double, a: IN): Double = {
    val v = f(a)
    if (b > v) b else v
  }
  override def merge(b1: Double, b2: Double): Double = if (b1 > b2) b1 else b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java API support: adapt a MapFunction to a Scala function, unboxing its result.
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  // Java-facing column with a boxed result type.
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}
+
/**
 * Typed aggregator computing the maximum of a Long extracted from each input row.
 *
 * @param f function extracting the Long value to maximize from each input row
 */
class TypedMaxLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  // NOTE(review): an empty group yields Long.MinValue (the zero element) rather
  // than null — confirm this is the intended semantics for empty input.
  override def zero: Long = Long.MinValue

  // Evaluate the (possibly expensive) user function once per row, not twice.
  override def reduce(b: Long, a: IN): Long = {
    val v = f(a)
    if (b > v) b else v
  }
  override def merge(b1: Long, b2: Long): Long = if (b1 > b2) b1 else b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java API support: adapt a MapFunction to a Scala function, unboxing its result.
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  // Java-facing column with a boxed result type.
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}
+
/**
 * Typed aggregator computing the minimum of a Double extracted from each input row.
 *
 * @param f function extracting the Double value to minimize from each input row
 */
class TypedMinDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  // NOTE(review): an empty group yields Double.MaxValue (the zero element) rather
  // than null/NaN — confirm this is the intended semantics for empty input.
  override def zero: Double = Double.MaxValue

  // Evaluate the (possibly expensive) user function once per row, not twice.
  override def reduce(b: Double, a: IN): Double = {
    val v = f(a)
    if (b < v) b else v
  }
  override def merge(b1: Double, b2: Double): Double = if (b1 < b2) b1 else b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java API support: adapt a MapFunction to a Scala function, unboxing its result.
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  // Java-facing column with a boxed result type.
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}
+
/**
 * Typed aggregator computing the minimum of a Long extracted from each input row.
 *
 * @param f function extracting the Long value to minimize from each input row
 */
class TypedMinLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  // NOTE(review): an empty group yields Long.MaxValue (the zero element) rather
  // than null — confirm this is the intended semantics for empty input.
  override def zero: Long = Long.MaxValue

  // Evaluate the (possibly expensive) user function once per row, not twice.
  override def reduce(b: Long, a: IN): Long = {
    val v = f(a)
    if (b < v) b else v
  }
  override def merge(b1: Long, b2: Long): Long = if (b1 < b2) b1 else b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java API support: adapt a MapFunction to a Scala function, unboxing its result.
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  // Java-facing column with a boxed result type.
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
index 1cb579c4faa76..6a8336e01d6f6 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/expressions/scalalang/typed.scala
@@ -77,14 +77,31 @@ object typed {
*/
def sumLong[IN](f: IN => Long): TypedColumn[IN, Long] = new
TypedSumLong[IN](f).toColumn
/**
 * Max aggregate function for floating point (double) type.
 *
 * @param f function extracting the Double value to maximize from each input row
 */
def max[IN](f: IN => Double): TypedColumn[IN, Double] = {
  val aggregator = new TypedMaxDouble[IN](f)
  aggregator.toColumn
}
+
/**
 * Max aggregate function for integral (long, i.e. 64 bit integer) type.
 *
 * @param f function extracting the Long value to maximize from each input row
 */
def maxLong[IN](f: IN => Long): TypedColumn[IN, Long] = {
  val aggregator = new TypedMaxLong[IN](f)
  aggregator.toColumn
}
+
/**
 * Min aggregate function for floating point (double) type.
 *
 * @param f function extracting the Double value to minimize from each input row
 */
def min[IN](f: IN => Double): TypedColumn[IN, Double] = {
  val aggregator = new TypedMinDouble[IN](f)
  aggregator.toColumn
}
+
/**
 * Min aggregate function for integral (long, i.e. 64 bit integer) type.
 *
 * @param f function extracting the Long value to minimize from each input row
 */
def minLong[IN](f: IN => Long): TypedColumn[IN, Long] = {
  val aggregator = new TypedMinLong[IN](f)
  aggregator.toColumn
}
+
// TODO:
// stddevOf: Double
// varianceOf: Double
// approx_count_distinct: Long
- // minOf: T
- // maxOf: T
-
// firstOf: T
// lastOf: T
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala
index 97c3f358c0e76..1eee4826f1eef 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala
@@ -323,6 +323,14 @@ class DatasetAggregatorSuite extends QueryTest with
SharedSQLContext {
("a", 2.0, 2L, 4.0, 4L), ("b", 3.0, 1L, 3.0, 3L))
}
test("typed aggregate: max, min") {
  // Two groups containing both positive and negative values, exercising all
  // four typed extreme-value aggregators in a single groupByKey pass.
  val input = Seq("a" -> 4, "a" -> -1, "b" -> -3, "b" -> 1, "b" -> -1).toDS()
  val aggregated = input.groupByKey(_._1)
    .agg(typed.max(_._2), typed.maxLong(_._2), typed.min(_._2), typed.minLong(_._2))
  checkDataset(
    aggregated,
    ("a", 4.0, 4L, -1.0, -1L),
    ("b", 1.0, 1L, -3.0, -3L))
}
+
test("generic typed sum") {
val ds = Seq("a" -> 1, "a" -> 3, "b" -> 3).toDS()
checkDataset(
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]