This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 785958e0d [VL] Gluten-it: Add option --scan-partitions (#5958)
785958e0d is described below
commit 785958e0d3693cf7529237119a6c918d8639a833
Author: Hongze Zhang <[email protected]>
AuthorDate: Tue Jun 4 10:42:49 2024 +0800
[VL] Gluten-it: Add option --scan-partitions (#5958)
---
.../org/apache/gluten/integration/BaseMixin.java | 12 ++++++------
.../scala/org/apache/gluten/integration/Suite.scala | 21 +++++++++++----------
.../integration/clickbench/ClickBenchSuite.scala | 4 ++--
.../apache/gluten/integration/ds/TpcdsSuite.scala | 4 ++--
.../org/apache/gluten/integration/h/TpchSuite.scala | 4 ++--
5 files changed, 23 insertions(+), 22 deletions(-)
diff --git
a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
index 41d244871..dc1691e50 100644
---
a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
+++
b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java
@@ -72,11 +72,11 @@ public class BaseMixin {
@CommandLine.Option(names = {"--disable-wscg"}, description = "Disable Spark
SQL whole stage code generation", defaultValue = "false")
private boolean disableWscg;
- @CommandLine.Option(names = {"--shuffle-partitions"}, description =
"Generate data with partitions", defaultValue = "100")
+ @CommandLine.Option(names = {"--shuffle-partitions"}, description = "Shuffle
partition number", defaultValue = "100")
private int shufflePartitions;
- @CommandLine.Option(names = {"--min-scan-partitions"}, description = "Use
minimum number of partitions to read data", defaultValue = "false")
- private boolean minimumScanPartitions;
+ @CommandLine.Option(names = {"--scan-partitions"}, description = "Scan
partition number. This is an approximate value, so the actual scan partition
number might vary around this value", defaultValue = "100")
+ private int scanPartitions;
@CommandLine.Option(names = {"--extra-conf"}, description = "Extra Spark
config entries applying to generated Spark session. E.g. --extra-conf=k1=v1
--extra-conf=k2=v2")
private Map<String, String> extraSparkConf = Collections.emptyMap();
@@ -131,19 +131,19 @@ public class BaseMixin {
suite = new TpchSuite(runModeEnumeration.getSparkMasterUrl(), actions,
testConf,
baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi,
enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj,
- disableWscg, shufflePartitions, minimumScanPartitions);
+ disableWscg, shufflePartitions, scanPartitions);
break;
case "ds":
suite = new TpcdsSuite(runModeEnumeration.getSparkMasterUrl(),
actions, testConf,
baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi,
enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj,
- disableWscg, shufflePartitions, minimumScanPartitions);
+ disableWscg, shufflePartitions, scanPartitions);
break;
case "clickbench":
suite = new ClickBenchSuite(runModeEnumeration.getSparkMasterUrl(),
actions, testConf,
baselineConf, extraSparkConfScala, level, errorOnMemLeak, enableUi,
enableHsUi, hsUiPort, offHeapSize, disableAqe, disableBhj,
- disableWscg, shufflePartitions, minimumScanPartitions);
+ disableWscg, shufflePartitions, scanPartitions);
break;
default:
throw new IllegalArgumentException("TPC benchmark type not found: " +
benchmarkType);
diff --git
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala
index 9e31e1171..bb5cb1889 100644
---
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala
+++
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/Suite.scala
@@ -43,7 +43,7 @@ abstract class Suite(
private val disableBhj: Boolean,
private val disableWscg: Boolean,
private val shufflePartitions: Int,
- private val minimumScanPartitions: Boolean) {
+ private val scanPartitions: Int) {
resetLogLevel()
@@ -103,15 +103,16 @@ abstract class Suite(
sessionSwitcher.defaultConf().setWarningOnOverriding("spark.sql.codegen.wholeStage",
"false")
}
- if (minimumScanPartitions) {
- sessionSwitcher
- .defaultConf()
- .setWarningOnOverriding("spark.sql.files.maxPartitionBytes",
s"${ByteUnit.PiB.toBytes(1L)}")
- sessionSwitcher
- .defaultConf()
- .setWarningOnOverriding("spark.sql.files.openCostInBytes",
s"${ByteUnit.PiB.toBytes(1L)}")
-
sessionSwitcher.defaultConf().setWarningOnOverriding("spark.default.parallelism",
"1")
- }
+ // Scan partition number.
+ sessionSwitcher
+ .defaultConf()
+ .setWarningOnOverriding("spark.sql.files.maxPartitionBytes",
s"${ByteUnit.PiB.toBytes(1L)}")
+ sessionSwitcher
+ .defaultConf()
+ .setWarningOnOverriding("spark.sql.files.openCostInBytes", "0")
+ sessionSwitcher
+ .defaultConf()
+ .setWarningOnOverriding("spark.sql.files.minPartitionNum",
s"${(scanPartitions - 1) max 1}")
extraSparkConf.toStream.foreach { kv =>
sessionSwitcher.defaultConf().setWarningOnOverriding(kv._1, kv._2)
diff --git
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala
index deffdb7e5..04a34d332 100644
---
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala
+++
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchSuite.scala
@@ -43,7 +43,7 @@ class ClickBenchSuite(
val disableBhj: Boolean,
val disableWscg: Boolean,
val shufflePartitions: Int,
- val minimumScanPartitions: Boolean)
+ val scanPartitions: Int)
extends Suite(
masterUrl,
actions,
@@ -60,7 +60,7 @@ class ClickBenchSuite(
disableBhj,
disableWscg,
shufflePartitions,
- minimumScanPartitions) {
+ scanPartitions) {
import ClickBenchSuite._
override protected def historyWritePath(): String = HISTORY_WRITE_PATH
diff --git
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
index 339e89d5b..a4365afde 100644
---
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
+++
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala
@@ -42,7 +42,7 @@ class TpcdsSuite(
val disableBhj: Boolean,
val disableWscg: Boolean,
val shufflePartitions: Int,
- val minimumScanPartitions: Boolean)
+ val scanPartitions: Int)
extends Suite(
masterUrl,
actions,
@@ -59,7 +59,7 @@ class TpcdsSuite(
disableBhj,
disableWscg,
shufflePartitions,
- minimumScanPartitions) {
+ scanPartitions) {
override protected def historyWritePath(): String = HISTORY_WRITE_PATH
diff --git
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
index 29c299bee..bdcac3bff 100644
---
a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
+++
b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala
@@ -38,7 +38,7 @@ class TpchSuite(
val disableBhj: Boolean,
val disableWscg: Boolean,
val shufflePartitions: Int,
- val minimumScanPartitions: Boolean)
+ val scanPartitions: Int)
extends Suite(
masterUrl,
actions,
@@ -55,7 +55,7 @@ class TpchSuite(
disableBhj,
disableWscg,
shufflePartitions,
- minimumScanPartitions) {
+ scanPartitions) {
override protected def historyWritePath(): String = HISTORY_WRITE_PATH
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]