martin-g commented on code in PR #19035:
URL: https://github.com/apache/datafusion/pull/19035#discussion_r2579852158


##########
benchmarks/bench.sh:
##########
@@ -611,10 +611,10 @@ run_tpch() {
     echo "Running tpch benchmark..."
 
     FORMAT=$2

Review Comment:
   ```suggestion
       FORMAT=${2:-parquet}
   ```



##########
benchmarks/bench.sh:
##########
@@ -548,20 +544,19 @@ data_tpch() {
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
+    FORMAT=$2

Review Comment:
   ```suggestion
       FORMAT=${2:-parquet}
   ```



##########
benchmarks/bench.sh:
##########
@@ -574,27 +569,32 @@ data_tpch() {
         docker run -v "${TPCH_DIR}":/data -it --entrypoint /bin/bash --rm 
ghcr.io/scalytics/tpch-docker:main  -c "cp -f 
/opt/tpch/2.18.0_rc2/dbgen/answers/* /data/answers/"
     fi
 
-    # Create 'parquet' files from tbl
-    FILE="${TPCH_DIR}/supplier"
-    if test -d "${FILE}"; then
-        echo " parquet files exist ($FILE exists)."
-    else
-        echo " creating parquet files using benchmark binary ..."
-        pushd "${SCRIPT_DIR}" > /dev/null
-        $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output 
"${TPCH_DIR}" --format parquet
-        popd > /dev/null
+    if [ "$FORMAT" = "parquet" ]; then
+      # Create 'parquet' files, one directory per file
+      FILE="${TPCH_DIR}/supplier"
+      if test -d "${FILE}"; then
+          echo " parquet files exist ($FILE exists)."
+      else
+          echo " creating parquet files using tpchgen-cli ..."
+          tpchgen-cli --scale-factor "${SCALE_FACTOR}" --format parquet 
--parquet-compression='ZSTD(1)' --parts=1 --output-dir "${TPCH_DIR}"

Review Comment:
   Should the compression type be a parameter as well ?
   E.g. `COMPRESSION=${3:-ZSTD(1)}` ?



##########
benchmarks/bench.sh:
##########
@@ -548,20 +544,19 @@ data_tpch() {
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
+    FORMAT=$2
 
     TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
-    echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in 
${TPCH_DIR}..."
+    echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in 
${TPCH_DIR}..."
 
     # Ensure the target data directory exists
     mkdir -p "${TPCH_DIR}"
 
-    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already 
exist
-    FILE="${TPCH_DIR}/supplier.tbl"
-    if test -f "${FILE}"; then
-        echo " tbl files exist ($FILE exists)."
-    else
-        echo " creating tbl files with tpch_dbgen..."
-        docker run -v "${TPCH_DIR}":/data -it --rm 
ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
+    # check if tpchgen_cli is installed
+    if ! command -v tpchgen-cli &> /dev/null
+    then
+        echo "tpchgen_cli could not be found, please install it via 'cargo 
install tpchgen-cli'"

Review Comment:
   ```suggestion
           echo "tpchgen-cli could not be found, please install it via 'cargo 
install tpchgen-cli'"
   ```



##########
benchmarks/README.md:
##########
@@ -246,25 +246,8 @@ You can enable `mimalloc` or `snmalloc` (to use either the 
mimalloc or snmalloc
 cargo run --release --features "mimalloc" --bin tpch -- benchmark datafusion 
--iterations 3 --path ./data --format tbl --query 1 --batch-size 4096

Review Comment:
   The `tpch` binary is removed with this PR.



##########
benchmarks/bench.sh:
##########
@@ -548,20 +544,19 @@ data_tpch() {
         echo "Internal error: Scale factor not specified"
         exit 1
     fi
+    FORMAT=$2
 
     TPCH_DIR="${DATA_DIR}/tpch_sf${SCALE_FACTOR}"
-    echo "Creating tpch dataset at Scale Factor ${SCALE_FACTOR} in 
${TPCH_DIR}..."
+    echo "Creating tpch $FORMAT dataset at Scale Factor ${SCALE_FACTOR} in 
${TPCH_DIR}..."
 
     # Ensure the target data directory exists
     mkdir -p "${TPCH_DIR}"
 
-    # Create 'tbl' (CSV format) data into $DATA_DIR if it does not already 
exist
-    FILE="${TPCH_DIR}/supplier.tbl"
-    if test -f "${FILE}"; then
-        echo " tbl files exist ($FILE exists)."
-    else
-        echo " creating tbl files with tpch_dbgen..."
-        docker run -v "${TPCH_DIR}":/data -it --rm 
ghcr.io/scalytics/tpch-docker:main -vf -s "${SCALE_FACTOR}"
+    # check if tpchgen_cli is installed

Review Comment:
   ```suggestion
       # check if tpchgen-cli is installed
   ```



##########
benchmarks/bench.sh:
##########


Review Comment:
   Here the format is not specified.
   This is why I think `FORMAT=${2:-parquet}` is needed below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to