This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 82341de10b [MINOR] Improve logs for TPC-H/DS (#11384)
82341de10b is described below
commit 82341de10b8d9a592b199cfa2fce9d3feedb3765
Author: litao <[email protected]>
AuthorDate: Fri Jan 9 22:58:29 2026 +0800
[MINOR] Improve logs for TPC-H/DS (#11384)
---
docs/developers/HowTo.md | 2 +-
tools/workload/tpcds-delta/run_tpcds/tpcds_delta.scala | 6 +++---
.../tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.scala | 8 +++++++-
tools/workload/tpcds/run_tpcds/tpcds_parquet.scala | 6 +++---
.../tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala | 7 ++++++-
tools/workload/tpch/run_tpch/tpch_parquet.scala | 6 +++---
6 files changed, 23 insertions(+), 12 deletions(-)
diff --git a/docs/developers/HowTo.md b/docs/developers/HowTo.md
index ee75b1d230..63d94c7400 100644
--- a/docs/developers/HowTo.md
+++ b/docs/developers/HowTo.md
@@ -195,7 +195,7 @@ Here we will explain how to run TPC-H on Velox backend with the Parquet file for
# How to run TPC-DS
-wait to add
+Please refer to `${GLUTEN_HOME}/tools/workload/tpcds/README.md`.
# How to track the memory exhaust problem
diff --git a/tools/workload/tpcds-delta/run_tpcds/tpcds_delta.scala b/tools/workload/tpcds-delta/run_tpcds/tpcds_delta.scala
index 8dd9f27ce5..b9d13b4c5c 100644
--- a/tools/workload/tpcds-delta/run_tpcds/tpcds_delta.scala
+++ b/tools/workload/tpcds-delta/run_tpcds/tpcds_delta.scala
@@ -30,11 +30,11 @@ var delta_file_root = "/ROOT_PATH"
var tpcds_queries_path = "/tools/gluten-it/common/src/main/resources/tpcds-queries/"
-def time[R](block: => R): R = {
+def time[R](fileName: String)(block: => R): R = {
val t0 = System.nanoTime()
val result = block // call-by-name
val t1 = System.nanoTime()
- println("Elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
+ println(s"$fileName, elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
result
}
@@ -91,7 +91,7 @@ for (t <- sorted) {
println(t)
val fileContents = Source.fromFile(t).getLines.filter(!_.startsWith("--")).mkString(" ")
println(fileContents)
- time{spark.sql(fileContents).collectAsList()}
+ time(t.getName){spark.sql(fileContents).collectAsList()}
// spark.sql(fileContents).explain
Thread.sleep(2000)
}
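
For context, the change curries the `time` helper so each measurement is tagged with the query file name instead of printing an anonymous "Elapsed time". A minimal, self-contained sketch of the resulting pattern, assuming a spark-shell session (where `spark` is predefined) and an illustrative query directory:

    import java.io.File
    import scala.io.Source

    // Curried timer: the first parameter list carries the label,
    // the second is the call-by-name block being measured.
    def time[R](fileName: String)(block: => R): R = {
      val t0 = System.nanoTime()
      val result = block
      val t1 = System.nanoTime()
      println(s"$fileName, elapsed time: " + (t1 - t0) / 1000000000.0 + " seconds")
      result
    }

    // Illustrative path; the real script derives it from the Gluten checkout.
    val sorted = new File("/PATH/TO/tpcds-queries/")
      .listFiles.filter(_.getName.endsWith(".sql")).sortBy(_.getName)

    for (t <- sorted) {
      val fileContents = Source.fromFile(t).getLines.filter(!_.startsWith("--")).mkString(" ")
      // Each query now logs under its own name, e.g. "q1.sql, elapsed time: 3.21 seconds".
      time(t.getName) { spark.sql(fileContents).collectAsList() }
    }
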
diff --git a/tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.scala b/tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.scala
index 6ccc559bc4..8f6206911c 100644
--- a/tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.scala
+++ b/tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.scala
@@ -22,7 +22,13 @@ val numPartitions = 200 // how many dsdgen partitions to run - number of input
val format = "parquet" // valid spark format like parquet "parquet".
val rootDir = "/PATH/TO/TPCDS_PARQUET_PATH" // root directory of location to
create data in.
-val dsdgenDir = "/PATH/TO/TPCDS_DBGEN" // location of dbgen
+
+/**
+ * Location of dbgen.
+ * Please compile with https://github.com/databricks/tpcds-kit.git.
+ * Other tpcds-kit may cause errors, see discussion: https://github.com/databricks/spark-sql-perf/issues/126.
+ */
+val dsdgenDir = "/PATH/TO/TPCDS_DBGEN"
val tables = new TPCDSTables(spark.sqlContext,
dsdgenDir = dsdgenDir,
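
The hunk above is truncated at the `TPCDSTables` constructor. For orientation, a sketch of how the full datagen call typically looks with the spark-sql-perf `TPCDSTables` API; the scale factor and `genData` flags below are illustrative assumptions, not values from this commit:

    import com.databricks.spark.sql.perf.tpcds.TPCDSTables

    val tables = new TPCDSTables(spark.sqlContext,
      dsdgenDir = dsdgenDir,        // must point at a databricks/tpcds-kit build
      scaleFactor = "100",          // illustrative: 100 GB
      useDoubleForDecimal = false,
      useStringForDate = false)

    tables.genData(
      location = rootDir,
      format = format,
      overwrite = true,
      partitionTables = true,
      clusterByPartitionColumns = true,
      filterOutNullPartitionValues = false,
      tableFilter = "",             // empty string generates all tables
      numPartitions = numPartitions)
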
diff --git a/tools/workload/tpcds/run_tpcds/tpcds_parquet.scala b/tools/workload/tpcds/run_tpcds/tpcds_parquet.scala
index 5ed9712bc4..10e1c0a909 100644
--- a/tools/workload/tpcds/run_tpcds/tpcds_parquet.scala
+++ b/tools/workload/tpcds/run_tpcds/tpcds_parquet.scala
@@ -30,11 +30,11 @@ var paq_file_root = "/ROOT_PATH"
var tpcds_queries_path = "/tools/gluten-it/common/src/main/resources/tpcds-queries/"
-def time[R](block: => R): R = {
+def time[R](fileName: String)(block: => R): R = {
val t0 = System.nanoTime()
val result = block // call-by-name
val t1 = System.nanoTime()
- println("Elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
+ println(s"$fileName, elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
result
}
@@ -117,7 +117,7 @@ for (t <- sorted) {
println(t)
val fileContents = Source.fromFile(t).getLines.filter(!_.startsWith("--")).mkString(" ")
println(fileContents)
- time{spark.sql(fileContents).collectAsList()}
+ time(t.getName){spark.sql(fileContents).collectAsList()}
// spark.sql(fileContents).explain
Thread.sleep(2000)
}
diff --git a/tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala b/tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala
index 4539050491..a3e6668ea8 100644
--- a/tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala
+++ b/tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala
@@ -24,7 +24,12 @@ val numPartitions = 200 // how many dsdgen partitions to run - number of input
val format = "parquet" // valid spark format like parquet "parquet".
val rootDir = "/PATH/TO/TPCH_PARQUET_PATH" // root directory of location to
create data in.
-val dbgenDir = "/PATH/TO/TPCH_DBGEN" // location of dbgen
+
+/**
+ * Location of dbgen.
+ * Please compile with https://github.com/databricks/tpch-dbgen.git.
+ */
+val dbgenDir = "/PATH/TO/TPCH_DBGEN"
val tables = new TPCHTables(spark.sqlContext,
dbgenDir = dbgenDir,
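
As with the TPC-DS script, the hunk is truncated at the constructor; the TPC-H counterpart in spark-sql-perf is `TPCHTables`, taking `dbgenDir` rather than `dsdgenDir`. A parallel sketch, with the scale factor and flags again illustrative:

    import com.databricks.spark.sql.perf.tpch.TPCHTables

    val tables = new TPCHTables(spark.sqlContext,
      dbgenDir = dbgenDir,          // must point at a databricks/tpch-dbgen build
      scaleFactor = "100",          // illustrative: 100 GB
      useDoubleForDecimal = false,
      useStringForDate = false)

    tables.genData(
      location = rootDir, format = format, overwrite = true,
      partitionTables = true, clusterByPartitionColumns = true,
      filterOutNullPartitionValues = false, tableFilter = "",
      numPartitions = numPartitions)
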
diff --git a/tools/workload/tpch/run_tpch/tpch_parquet.scala b/tools/workload/tpch/run_tpch/tpch_parquet.scala
index 2bb5004e45..dc66f77b16 100644
--- a/tools/workload/tpch/run_tpch/tpch_parquet.scala
+++ b/tools/workload/tpch/run_tpch/tpch_parquet.scala
@@ -24,11 +24,11 @@ import sys.process._
var parquet_file_path = "/PATH/TO/TPCH_PARQUET_PATH"
var gluten_root = "/PATH/TO/GLUTEN"
-def time[R](block: => R): R = {
+def time[R](fileName: String)(block: => R): R = {
val t0 = System.nanoTime()
val result = block // call-by-name
val t1 = System.nanoTime()
- println("Elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
+ println(s"$fileName, elapsed time: " + (t1 - t0)/1000000000.0 + " seconds")
result
}
@@ -88,6 +88,6 @@ for (t <- sorted) {
}
println(t)
println(fileContents)
- time{spark.sql(fileContents).collectAsList()}
+ time(t.getName){spark.sql(fileContents).collectAsList()}
Thread.sleep(2000)
}
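
One practical payoff of tagging every timing line with the query file name: per-query results can be recovered from a saved driver log with a simple scan. A hypothetical post-processing sketch (the log path, and the assumption that the timing lines appear verbatim in the log, are both illustrative):

    import scala.io.Source

    // Matches lines like "q1.sql, elapsed time: 3.21 seconds".
    val timing = """(\S+\.sql), elapsed time: ([0-9.]+) seconds""".r
    val results = Source.fromFile("/PATH/TO/driver.log").getLines.collect {
      case timing(q, s) => (q, s.toDouble)
    }.toList
    // Print slowest queries first.
    results.sortBy(-_._2).foreach { case (q, s) => println(f"$q%-12s $s%9.3f s") }
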
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]