This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git

The following commit(s) were added to refs/heads/main by this push:
     new 31c570e3ee feat: Support tpch and tpch10 csv format (#16373)
31c570e3ee is described below

commit 31c570e3ee7fa830753b2bbab3ec1a635ef16a30
Author: Qi Zhu <821684...@qq.com>
AuthorDate: Thu Jun 12 12:33:25 2025 +0800

    feat: Support tpch and tpch10 csv format (#16373)
---
 benchmarks/bench.sh        | 33 ++++++++++++++++++++++++++++-----
 benchmarks/src/tpch/run.rs |  2 +-
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index b34c646c5b..837e8c6c9b 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -78,8 +78,10 @@ venv: Creates new venv (unless already exists) and installs compare's
 **********
 all(default): Data/Run/Compare for all benchmarks
 tpch: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single parquet file per table, hash join
+tpch_csv: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), single csv file per table, hash join
 tpch_mem: TPCH inspired benchmark on Scale Factor (SF) 1 (~1GB), query from memory
 tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single parquet file per table, hash join
+tpch_csv10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), single csv file per table, hash join
 tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
 cancellation: How long cancelling a query takes
 parquet: Benchmark of parquet reader's filtering speed
@@ -266,9 +268,11 @@ main() {
             mkdir -p "${DATA_DIR}"
             case "$BENCHMARK" in
                 all)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    run_tpch "1" "csv"
                     run_tpch_mem "1"
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    run_tpch "10" "csv"
                     run_tpch_mem "10"
                     run_cancellation
                     run_parquet
@@ -286,13 +290,19 @@ main() {
                     run_external_aggr
                     ;;
                 tpch)
-                    run_tpch "1"
+                    run_tpch "1" "parquet"
+                    ;;
+                tpch_csv)
+                    run_tpch "1" "csv"
                     ;;
                 tpch_mem)
                     run_tpch_mem "1"
                     ;;
                 tpch10)
-                    run_tpch "10"
+                    run_tpch "10" "parquet"
+                    ;;
+                tpch_csv10)
+                    run_tpch "10" "csv"
                     ;;
                 tpch_mem10)
                     run_tpch_mem "10"
@@ -430,6 +440,17 @@ data_tpch() {
         $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
         popd > /dev/null
     fi
+
+    # Create 'csv' files from tbl
+    FILE="${TPCH_DIR}/csv/supplier"
+    if test -d "${FILE}"; then
+        echo " csv files exist ($FILE exists)."
+    else
+        echo " creating csv files using benchmark binary ..."
+        pushd "${SCRIPT_DIR}" > /dev/null
+        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}/csv" --format csv
+        popd > /dev/null
+    fi
 }
 
 # Runs the tpch benchmark
@@ -446,7 +467,9 @@ run_tpch() {
     echo "Running tpch benchmark..."
     # Optional query filter to run specific query
     QUERY=$([ -n "$ARG3" ] && echo "--query $ARG3" || echo "")
-    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" $QUERY
+
+    FORMAT=$2
+    debug_run $CARGO_COMMAND --bin tpch -- benchmark datafusion --iterations 5 --path "${TPCH_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format ${FORMAT} -o "${RESULTS_FILE}" $QUERY
 }
 
 # Runs the tpch in memory
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
index 20867991f2..88960d7c7d 100644
--- a/benchmarks/src/tpch/run.rs
+++ b/benchmarks/src/tpch/run.rs
@@ -274,7 +274,7 @@ impl RunOpt {
                 (Arc::new(format), path, ".tbl")
             }
             "csv" => {
-                let path = format!("{path}/{table}");
+                let path = format!("{path}/csv/{table}");
                 let format = CsvFormat::default()
                     .with_delimiter(b',')
                     .with_has_header(true);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org
For additional commands, e-mail: commits-h...@datafusion.apache.org