Repository: spark
Updated Branches:
  refs/heads/master 4fe7c7bd1 -> b85d18f3b
[SPARK-15709][SQL] Prevent `freqItems` from raising `UnsupportedOperationException: empty.min`

## What changes were proposed in this pull request?

Currently, `freqItems` raises `UnsupportedOperationException` on `empty.min` when its `support` argument is high. In particular, `support > 1.0` makes the internal map size `(1 / support).toInt` zero, so the counting logic ends up calling `.min` on an empty collection:

```scala
scala> spark.createDataset(Seq(1, 2, 2, 3, 3, 3)).stat.freqItems(Seq("value"), 2)
16/06/01 11:11:38 ERROR Executor: Exception in task 5.0 in stage 0.0 (TID 5)
java.lang.UnsupportedOperationException: empty.min
...
```

Also, the existing parameter check is inconsistent: it tests `support >= 1e-4` while its message says "greater than", and it imposes no upper bound on `support`.

```
require(support >= 1e-4, s"support ($support) must be greater than 1e-4.")
```

This PR changes the logic to handle the empty case and also improves the parameter checking.

## How was this patch tested?

Pass the Jenkins tests (with a new test case).

Author: Dongjoon Hyun <dongj...@apache.org>

Closes #13449 from dongjoon-hyun/SPARK-15709.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b85d18f3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b85d18f3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b85d18f3

Branch: refs/heads/master
Commit: b85d18f3bdedca7ae7f2c26ff64ce38c2796bd63
Parents: 4fe7c7b
Author: Dongjoon Hyun <dongj...@apache.org>
Authored: Thu Jun 2 11:12:17 2016 -0500
Committer: Sean Owen <so...@cloudera.com>
Committed: Thu Jun 2 11:12:17 2016 -0500

----------------------------------------------------------------------
 .../apache/spark/sql/execution/stat/FrequentItems.scala |  4 ++--
 .../scala/org/apache/spark/sql/DataFrameStatSuite.scala | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b85d18f3/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
index 34bd243..b19344f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/FrequentItems.scala
@@ -40,7 +40,7 @@ private[sql] object FrequentItems extends Logging {
         if (baseMap.size < size) {
           baseMap += key -> count
         } else {
-          val minCount = baseMap.values.min
+          val minCount = if (baseMap.values.isEmpty) 0 else baseMap.values.min
           val remainder = count - minCount
           if (remainder >= 0) {
             baseMap += key -> count // something will get kicked out, so we can add this
@@ -83,7 +83,7 @@ private[sql] object FrequentItems extends Logging {
       df: DataFrame,
       cols: Seq[String],
       support: Double): DataFrame = {
-    require(support >= 1e-4, s"support ($support) must be greater than 1e-4.")
+    require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.")
     val numCols = cols.length
     // number of max items to keep counts for
     val sizeOfMap = (1 / support).toInt
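For context, here is a minimal standalone sketch of the failure mode and the guard applied above. The `counts` map and the object name are hypothetical stand-ins for `FrequentItems`' internal `baseMap`; this is plain Scala with no Spark dependency:

```scala
import scala.collection.mutable

object EmptyMinSketch {
  def main(args: Array[String]): Unit = {
    val counts = mutable.Map.empty[Int, Long]

    // Before the patch: calling `min` on an empty collection throws
    // java.lang.UnsupportedOperationException: empty.min
    // val minCount = counts.values.min

    // After the patch: the empty case falls back to 0, so any incoming
    // count is accepted (remainder = count - 0 >= 0).
    val minCount = if (counts.values.isEmpty) 0L else counts.values.min
    println(minCount) // prints 0
  }
}
```

On Scala 2.13+, `counts.values.minOption.getOrElse(0L)` would express the same guard more directly, but the collections API available to Spark at the time predates `minOption`.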

http://git-wip-us.apache.org/repos/asf/spark/blob/b85d18f3/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
index ab7733b..73026c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala
@@ -235,6 +235,17 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
     assert(items.length === 1)
   }
 
+  test("SPARK-15709: Prevent `UnsupportedOperationException: empty.min` in `freqItems`") {
+    val ds = spark.createDataset(Seq(1, 2, 2, 3, 3, 3))
+
+    intercept[IllegalArgumentException] {
+      ds.stat.freqItems(Seq("value"), 0)
+    }
+    intercept[IllegalArgumentException] {
+      ds.stat.freqItems(Seq("value"), 2)
+    }
+  }
+
   test("sampleBy") {
     val df = spark.range(0, 100).select((col("id") % 3).as("key"))
     val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L)
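As a usage note: with the new `require` in place, the reproduction from the description should now fail fast at the driver with an `IllegalArgumentException` rather than an executor-side `UnsupportedOperationException`. A minimal sketch of the expected behavior follows; the object name and local-mode setup are illustrative, not part of the patch:

```scala
import org.apache.spark.sql.SparkSession

object FreqItemsSupportCheck {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("SPARK-15709-check")
      .getOrCreate()
    import spark.implicits._

    val ds = Seq(1, 2, 2, 3, 3, 3).toDS() // single column named "value"
    try {
      // support = 2 exceeds the new upper bound of 1.0
      ds.stat.freqItems(Seq("value"), 2)
    } catch {
      case e: IllegalArgumentException =>
        // expected: "requirement failed: Support must be in [1e-4, 1], but got 2.0."
        println(e.getMessage)
    }
    spark.stop()
  }
}
```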