This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 5581a92 [SPARK-32908][SQL] Fix target error calculation in
`percentile_approx()`
5581a92 is described below
commit 5581a92bb8b1d96278796e6c18812c5a2931db11
Author: Max Gekk <[email protected]>
AuthorDate: Fri Sep 18 10:47:06 2020 +0900
[SPARK-32908][SQL] Fix target error calculation in `percentile_approx()`
### What changes were proposed in this pull request?
1. Change the target error calculation according to the paper
[Space-Efficient Online Computation of Quantile
Summaries](http://infolab.stanford.edu/~datar/courses/cs361a/papers/quantiles.pdf).
It says that the error `e = max(gi, deltai)/2` (see the page 59). Also this
has clear explanation [ε-approximate
quantiles](http://www.mathcs.emory.edu/~cheung/Courses/584/Syllabus/08-Quantile/Greenwald.html#proofprop1).
2. Added a test to check different accuracies.
3. Added an input CSV file `percentile_approx-input.csv.bz2` to the
resource folder `sql/catalyst/src/main/resources` for the test.
### Why are the changes needed?
To fix incorrect percentile calculation, see an example in SPARK-32908.
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?
- By running existing tests in `QuantileSummariesSuite` and in
`ApproximatePercentileQuerySuite`.
- Added new test `SPARK-32908: maximum target error in percentile_approx`
to `ApproximatePercentileQuerySuite`.
Closes #29784 from MaxGekk/fix-percentile_approx-2.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
(cherry picked from commit 75dd86400c3c2348a4139586fbbead840512b909)
Signed-off-by: HyukjinKwon <[email protected]>
---
.../sql/catalyst/util/QuantileSummaries.scala | 2 +-
.../test-data/percentile_approx-input.csv.bz2 | Bin 0 -> 124614 bytes
.../sql/ApproximatePercentileQuerySuite.scala | 21 ++++++++++++++++++++-
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
index 3a0490d..e07aa0f 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala
@@ -254,7 +254,7 @@ class QuantileSummaries(
// Target rank
val rank = math.ceil(quantile * count).toLong
- val targetError = relativeError * count
+ val targetError = sampled.map(s => s.delta + s.g).max / 2
// Minimum rank at current sample
var minRank = 0L
var i = 0
diff --git
a/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2
b/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2
new file mode 100644
index 0000000..f85e289
Binary files /dev/null and
b/sql/core/src/test/resources/test-data/percentile_approx-input.csv.bz2 differ
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
index 2b4abed..4991e39 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala
@@ -150,7 +150,7 @@ class ApproximatePercentileQuerySuite extends QueryTest
with SharedSparkSession
(1 to 1000).toDF("col").createOrReplaceTempView(table)
checkAnswer(
spark.sql(s"SELECT percentile_approx(col, array(0.25 + 0.25D), 200 +
800) FROM $table"),
- Row(Seq(499))
+ Row(Seq(500))
)
}
}
@@ -296,4 +296,23 @@ class ApproximatePercentileQuerySuite extends QueryTest
with SharedSparkSession
buffer.quantileSummaries
assert(buffer.isCompressed)
}
+
+ test("SPARK-32908: maximum target error in percentile_approx") {
+ withTempView(table) {
+ spark.read
+ .schema("col int")
+ .csv(testFile("test-data/percentile_approx-input.csv.bz2"))
+ .repartition(1)
+ .createOrReplaceTempView(table)
+ checkAnswer(
+ spark.sql(
+ s"""SELECT
+ | percentile_approx(col, 0.77, 1000),
+ | percentile_approx(col, 0.77, 10000),
+ | percentile_approx(col, 0.77, 100000),
+ | percentile_approx(col, 0.77, 1000000)
+ |FROM $table""".stripMargin),
+ Row(18, 17, 17, 17))
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]