(datafusion) branch main updated: feat: Support tpch and tpch10 csv format (#16373)

alamb Wed, 11 Jun 2025 23:44:20 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 31c570e3ee feat: Support tpch and tpch10 csv format (#16373)
31c570e3ee is described below

commit 31c570e3ee7fa830753b2bbab3ec1a635ef16a30
Author: Qi Zhu <821684...@qq.com>
AuthorDate: Thu Jun 12 12:33:25 2025 +0800

    feat: Support tpch and tpch10 csv format (#16373)
---
 benchmarks/bench.sh        | 33 ++++++++++++++++++++++++++++-----
 benchmarks/src/tpch/run.rs |  2 +-
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b34c646c5b..837e8c6c9b 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -78,8 +78,10 @@ venv:            Creates new venv (unless already exists) and installs compare's
 **********
 all(default): Data/Run/Compare for all benchmarks
 tpch:                   TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
+tpch_csv:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single csv file per table, hash join
 tpch_mem:               TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
 tpch10:                 TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
+tpch_csv10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join
 tpch_mem10:             TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
 cancellation:           How long cancelling a query takes
 parquet:                Benchmark of parquet reader's filtering speed
@@ -266,9 +268,11 @@ main() {
             mkdir -p "${DATA_DIR}"
             case "$BENCHMARK" in
                 all)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    run_tpch "1" "csv"
                     run_tpch_mem "1"
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    run_tpch "10" "csv"
                     run_tpch_mem "10"
                     run_cancellation
                     run_parquet
@@ -286,13 +290,19 @@ main() {
                     run_external_aggr
                     ;;
                 tpch)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    ;;
+                tpch_csv)
+                    run_tpch "1" "csv"
                     ;;
                 tpch_mem)
                     run_tpch_mem "1"
                     ;;
                 tpch10)
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    ;;
+                tpch_csv10)
+                    run_tpch "10" "csv"
                     ;;
                 tpch_mem10)
                     run_tpch_mem "10"
@@ -430,6 +440,17 @@ data_tpch() {
         $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
         popd > /dev/null
     fi
+
+    # Create 'csv' files from tbl
+    FILE="${TPCH_DIR}/csv/supplier"
+    if test -d "${FILE}"; then
+        echo " csv files exist ($FILE exists)."
+    else
+        echo " creating csv files using benchmark binary ..."
+        pushd "${SCRIPT_DIR}" > /dev/null
+        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv
+        popd > /dev/null
+    fi
 }
 
 # Runs the tpch benchmark
@@ -446,7 +467,9 @@ run_tpch() {
     echo "Running tpch benchmark..."
     # Optional query filter to run specific query
     QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
+
+    FORMAT=$2
+    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" $QUERY
 }
 
 # Runs the tpch in memory
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
index 20867991f2..88960d7c7d 100644
--- a/benchmarks/src/tpch/run.rs
+++ b/benchmarks/src/tpch/run.rs
@@ -274,7 +274,7 @@ impl RunOpt {
                     (Arc::new(format), path, ".tbl")
                 }
                 "csv" => {
-                    let path = format!("{path}/{table}");
+                    let path = format!("{path}/csv/{table}");
                     let format = CsvFormat::default()
                         .with_delimiter(b',')
                         .with_has_header(true);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org

(datafusion) branch main updated: feat: Support tpch and tpch10 csv format (#16373)

Reply via email to