This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new a4cafeee1 [VL] Gluten-it: --data-gen-strategy=once to skip data-gen when it already exists (#6587)
a4cafeee1 is described below
commit a4cafeee129fc8bc06733994f1bbebf0d008a85b
Author: Hongze Zhang <[email protected]>
AuthorDate: Fri Jul 26 10:02:05 2024 +0800
[VL] Gluten-it: --data-gen-strategy=once to skip data-gen when it already exists (#6587)
---
.github/workflows/velox_be.yml.deprecated | 12 ++++-----
.github/workflows/velox_docker.yml | 18 +++++++-------
.../gluten/integration/command/DataGenMixin.java | 24 +++++++++++++-----
.../gluten/integration/action/DataGenOnly.scala | 29 ++++++++++++++++++++--
4 files changed, 60 insertions(+), 23 deletions(-)
diff --git a/.github/workflows/velox_be.yml.deprecated b/.github/workflows/velox_be.yml.deprecated
index d095af64d..6ff5ec743 100644
--- a/.github/workflows/velox_be.yml.deprecated
+++ b/.github/workflows/velox_be.yml.deprecated
@@ -529,9 +529,9 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd
/opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries \
- --local --preset=velox --benchmark-type=h --error-on-memleak
--off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --skip-data-gen
--random-kill-tasks \
+ --local --preset=velox --benchmark-type=h --error-on-memleak
--off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --data-gen-strategy=skip
--random-kill-tasks \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries \
- --local --preset=velox --benchmark-type=ds --error-on-memleak
--off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --skip-data-gen
--random-kill-tasks'
+ --local --preset=velox --benchmark-type=ds --error-on-memleak
--off-heap-size=50g -s=30.0 --threads=32 --iterations=1
--data-gen-strategy=skip --random-kill-tasks'
- name: Exit docker container
if: ${{ always() }}
run: |
@@ -580,7 +580,7 @@ jobs:
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
@@ -592,7 +592,7 @@ jobs:
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
@@ -603,7 +603,7 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd
/opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1
\
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
@@ -615,7 +615,7 @@ jobs:
$PATH_TO_GLUTEN_TE/$OS_IMAGE_NAME/gha/gha-checkout/exec.sh 'cd
/opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_docker.yml
index 1e88e034e..47dd7a919 100644
--- a/.github/workflows/velox_docker.yml
+++ b/.github/workflows/velox_docker.yml
@@ -296,7 +296,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
@@ -308,7 +308,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q67 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
@@ -319,7 +319,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:6g,spark.memory.offHeap.size=6g \
-d=OFFHEAP_SIZE:4g,spark.memory.offHeap.size=4g \
@@ -330,7 +330,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1
\
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
\
@@ -341,7 +341,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1
\
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
\
@@ -352,7 +352,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
- --skip-data-gen -m=OffHeapExecutionMemory \
+ --data-gen-strategy=skip -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1
\
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
@@ -408,7 +408,7 @@ jobs:
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=ds --error-on-memleak
-s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1
\
- --skip-data-gen --random-kill-tasks --no-session-reuse
+ --data-gen-strategy=skip --random-kill-tasks --no-session-reuse
# run-tpc-test-ubuntu-sf30:
# needs: build-native-lib-centos-7
@@ -457,10 +457,10 @@ jobs:
# cd tools/gluten-it \
# && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
# --local --preset=velox --benchmark-type=h --error-on-memleak
-s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1
\
- # --skip-data-gen --shard=${{ matrix.shard }} \
+ # --data-gen-strategy=skip --shard=${{ matrix.shard }} \
# && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
# --local --preset=velox --benchmark-type=ds --error-on-memleak
-s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1
\
- # --skip-data-gen --shard=${{ matrix.shard }}
+ # --data-gen-strategy=skip --shard=${{ matrix.shard }}
run-tpc-test-centos8-uniffle:
needs: build-native-lib-centos-7
diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java
index 0682f5601..3854d078e 100644
--- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java
+++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/DataGenMixin.java
@@ -17,23 +17,35 @@
package org.apache.gluten.integration.command;
import org.apache.gluten.integration.action.Action;
+import org.apache.gluten.integration.action.DataGenOnly;
import picocli.CommandLine;
public class DataGenMixin {
+ @CommandLine.Option(names = {"--data-gen-strategy"}, description = "The strategy of data generation, accepted values: skip, once, always", defaultValue = "always")
+ private String dataGenStrategy;
+
@CommandLine.Option(names = {"-s", "--scale"}, description = "The scale factor of sample TPC-H dataset", defaultValue = "0.1")
private double scale;
@CommandLine.Option(names = {"--gen-partitioned-data"}, description = "Generate data with partitions", defaultValue = "false")
private boolean genPartitionedData;
- @CommandLine.Option(names = {"--skip-data-gen"}, description = "Skip data generation", defaultValue = "false")
- private boolean skipDataGen;
-
public Action[] makeActions() {
- if (skipDataGen) {
- return new Action[0];
+ final DataGenOnly.Strategy strategy;
+ switch (dataGenStrategy) {
+ case "skip":
+ strategy = DataGenOnly.Skip$.MODULE$;
+ break;
+ case "once":
+ strategy = DataGenOnly.Once$.MODULE$;
+ break;
+ case "always":
+ strategy = DataGenOnly.Always$.MODULE$;
+ break;
+ default:
+ throw new IllegalArgumentException("Unexpected data-gen strategy: " + dataGenStrategy);
}
- return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(scale, genPartitionedData)};
+ return new Action[]{new org.apache.gluten.integration.action.DataGenOnly(strategy, scale, genPartitionedData)};
}
public double getScale() {
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala
index bc4383461..dc54e9737 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/action/DataGenOnly.scala
@@ -20,11 +20,36 @@ import org.apache.gluten.integration.Suite
import java.io.File
-case class DataGenOnly(scale: Double, genPartitionedData: Boolean) extends Action {
+case class DataGenOnly(strategy: DataGenOnly.Strategy, scale: Double, genPartitionedData: Boolean)
+ extends Action {
override def execute(suite: Suite): Boolean = {
+ strategy match {
+ case DataGenOnly.Skip =>
+ // Do nothing
+ case DataGenOnly.Once =>
+ val dataPath = suite.dataWritePath(scale, genPartitionedData)
+ val alreadyExists = new File(dataPath).exists()
+ if (alreadyExists) {
+ println(s"Data already exists at $dataPath, skipping generating it.")
+ } else {
+ gen(suite)
+ }
+ case DataGenOnly.Always =>
+ gen(suite)
+ }
+ true
+ }
+
+ private def gen(suite: Suite): Unit = {
suite.sessionSwitcher.useSession("baseline", "Data Gen")
val dataGen = suite.createDataGen(scale, genPartitionedData)
dataGen.gen()
- true
}
}
+
+object DataGenOnly {
+ sealed trait Strategy
+ case object Skip extends Strategy
+ case object Once extends Strategy
+ case object Always extends Strategy
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]