Repository: spark
Updated Branches:
  refs/heads/master 623fc7fc6 -> cb80edc26
[SPARK-18111][SQL] Wrong ApproximatePercentile answer when multiple records have the minimum value

## What changes were proposed in this pull request?

When multiple records have the minimum value, the answer of ApproximatePercentile is wrong.

## How was this patch tested?

add a test case

Author: wangzhenhua <wangzhen...@huawei.com>

Closes #15641 from wzhfy/percentile.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cb80edc2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cb80edc2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cb80edc2

Branch: refs/heads/master
Commit: cb80edc26349e2e358d27fe2ae8e5d6959b77fab
Parents: 623fc7f
Author: wangzhenhua <wangzhen...@huawei.com>
Authored: Tue Nov 1 13:11:24 2016 +0000
Committer: Sean Owen <so...@cloudera.com>
Committed: Tue Nov 1 13:11:24 2016 +0000

----------------------------------------------------------------------
 .../spark/sql/catalyst/util/QuantileSummaries.scala |  4 +++-
 .../spark/sql/ApproximatePercentileQuerySuite.scala | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/cb80edc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
index 27928c4..04f4ff2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
@@ -264,7 +264,9 @@ object QuantileSummaries {
     res.prepend(head)
     // If necessary, add the minimum element:
     val currHead = currentSamples.head
-    if (currHead.value < head.value) {
+    // don't add the minimum element if `currentSamples` has only one element (both `currHead` and
+    // `head` point to the same element)
+    if (currHead.value <= head.value && currentSamples.length > 1) {
       res.prepend(currentSamples.head)
     }
     res.toArray

http://git-wip-us.apache.org/repos/asf/spark/blob/cb80edc2/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
index 37d7c44..e98092d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
@@ -64,6 +64,17 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSQLContext {
     }
   }
 
+  test("percentile_approx, multiple records with the minimum value in a partition") {
+    withTempView(table) {
+      spark.sparkContext.makeRDD(Seq(1, 1, 2, 1, 1, 3, 1, 1, 4, 1, 1, 5), 4).toDF("col")
+        .createOrReplaceTempView(table)
+      checkAnswer(
+        spark.sql(s"SELECT percentile_approx(col, array(0.5)) FROM $table"),
+        Row(Seq(1.0D))
+      )
+    }
+  }
+
   test("percentile_approx, with different accuracies") {
 
     withTempView(table) {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org