This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 22102931a90 [SPARK-39856][SQL][TESTS] Increase the number of
partitions in TPC-DS build to avoid out-of-memory
22102931a90 is described below
commit 22102931a9044f688ad5ff03c241d5deee17dfe1
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Mon Jul 25 12:44:54 2022 +0900
[SPARK-39856][SQL][TESTS] Increase the number of partitions in TPC-DS build
to avoid out-of-memory
This PR proposes to avoid out-of-memory in TPC-DS build at GitHub Actions
CI by:
- Increasing the number of partitions being used in shuffle.
- Truncating floating-point values after the 10th decimal place.
The number of partitions was previously set to 1 to avoid differences in
precision that can generally be ignored.
- Sorting the results regardless of join type, since Apache Spark does not
guarantee the order of results.
One of the reasons for the large memory usage seems to be single partition
that's being used in the shuffle.
No user-facing change; this is a test-only change.
GitHub Actions in this CI will test it out.
Closes #37270 from HyukjinKwon/deflake-tpcds.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
(cherry picked from commit 7358253755762f9bfe6cedc1a50ec14616cfeace)
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
index 670048826e3..a81e7bcb8c7 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQueryTestSuite.scala
@@ -58,7 +58,7 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
// To make output results deterministic
protected override def sparkConf: SparkConf = super.sparkConf
- .set(SQLConf.SHUFFLE_PARTITIONS.key, "1")
+ .set(SQLConf.SHUFFLE_PARTITIONS.key, 4.toString)
protected override def createSparkSession: TestSparkSession = {
new TestSparkSession(new SparkContext("local[1]",
this.getClass.getSimpleName, sparkConf))
@@ -106,7 +106,9 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
private def runQuery(query: String, goldenFile: File): Unit = {
val (schema, output) = handleExceptions(getNormalizedResult(spark, query))
val queryString = query.trim
- val outputString = output.mkString("\n").replaceAll("\\s+$", "")
+ val outputString = output.mkString("\n")
+ .replaceAll("\\s+$", "")
+ .replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""", "$1")
if (regenerateGoldenFiles) {
val goldenOutput = {
s"-- Automatically generated by ${getClass.getSimpleName}\n\n" +
@@ -133,7 +135,8 @@ class TPCDSQueryTestSuite extends QueryTest with TPCDSBase
with SQLQueryTestHelp
s"Expected 3 blocks in result file but got ${segments.size}. " +
"Try regenerate the result files.")
- (segments(1).trim, segments(2).replaceAll("\\s+$", ""))
+ (segments(1).trim, segments(2)
+ .replaceAll("\\s+$", "").replaceAll("""([0-9]+.[0-9]{10})([0-9]*)""",
"$1"))
}
assertResult(expectedSchema, s"Schema did not match\n$queryString") {
schema }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]