This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 0a27d0c6e8e [SPARK-39856][SQL][TESTS] Increase the number of
partitions in TPC-DS build to avoid out-of-memory
0a27d0c6e8e is described below
commit 0a27d0c6e8e705176f0f245794bc8361860ac680
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Mon Jul 25 12:44:54 2022 +0900
[SPARK-39856][SQL][TESTS] Increase the number of partitions in TPC-DS build
to avoid out-of-memory
This PR proposes to avoid out-of-memory in TPC-DS build at GitHub Actions
CI by:
- Increasing the number of partitions being used in shuffle.
- Truncating floating-point values after the 10th decimal place.
The number of partitions was previously set to 1 because of differences
in floating-point precision that generally we can just ignore.
- Sorting the results regardless of join type, since Apache Spark does not
guarantee the order of results.
One of the reasons for the large memory usage seems to be the single partition
that was being used in the shuffle.
No, test-only.
The GitHub Actions CI build will test it out.
Closes #37270 from HyukjinKwon/deflake-tpcds.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 7358253755762f9bfe6cedc1a50ec14616cfeace)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
index c16bcd9fd05..7fb4b567b1a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
@@ -58,7 +58,7 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
// To make output results deterministic
protected override def sparkConf: SparkConf = super.sparkConf
- .set(SQLConf.SHUFFLE_PARTITIONS.key, "1")
+ .set(SQLConf.SHUFFLE_PARTITIONS.key, 4.toString)
protected override def createSparkSession: TestSparkSession = {
new TestSparkSession(new SparkContext("local[1]",
this.getClass.getSimpleName, sparkConf))
@@ -103,7 +103,9 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
private def runQuery(query: String, goldenFile: File): Unit = {
val (schema, output) = handleExceptions(getNormalizedResult(spark, query))
val queryString = query.trim
- val outputString = output.mkString("\n").replaceAll("\\s+$", "")
+ val outputString = output.mkString("\n")
+ .replaceAll("\\s+$", "")
+ .replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""", "$1")
if (regenerateGoldenFiles) {
val goldenOutput = {
s"-- Automatically generated by ${getClass.getSimpleName}\n\n" +
@@ -130,7 +132,8 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
s"Expected 3 blocks in result file but got ${segments.size}. " +
"Try regenerate the result files.")
- (segments(1).trim, segments(2).replaceAll("\\s+$", ""))
+ (segments(1).trim, segments(2)
+ .replaceAll("\\s+$", "").replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""",
"$1"))
}
assertResult(expectedSchema, s"Schema did not match\n$queryString") {
schema }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]